Files
RLVR-math-7b-4gpu/trainer_state.json
ModelHub XC a268b8d790 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/RLVR-math-7b-4gpu
Source: Original Platform
2026-06-02 04:43:19 +08:00

7043 lines
253 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21333333333333335,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2957.0,
"completions/max_terminated_length": 2957.0,
"completions/mean_length": 518.421875,
"completions/mean_terminated_length": 535.1451416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.0010666666666666667,
"grad_norm": 0.01323013287037611,
"learning_rate": 2.5e-08,
"loss": 0.0198,
"num_tokens": 176812.0,
"reward": 0.248046875,
"reward_std": 0.3387196660041809,
"rewards/accuracy_reward": 0.14453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.3515625,
"rewards/mean_confidence_reward": 0.0,
"step": 1
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2622.0,
"completions/max_terminated_length": 2622.0,
"completions/mean_length": 517.49609375,
"completions/mean_terminated_length": 523.6324462890625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0021333333333333334,
"grad_norm": 0.012064808048307896,
"learning_rate": 5e-08,
"loss": 0.0167,
"num_tokens": 349091.0,
"reward": 0.271484375,
"reward_std": 0.31040170788764954,
"rewards/accuracy_reward": 0.1171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.42578125,
"rewards/mean_confidence_reward": 0.0,
"step": 2
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 3046.0,
"completions/max_terminated_length": 3046.0,
"completions/mean_length": 530.734375,
"completions/mean_terminated_length": 541.3067626953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.0032,
"grad_norm": 0.02306981198489666,
"learning_rate": 7.5e-08,
"loss": 0.0145,
"num_tokens": 526727.0,
"reward": 0.212890625,
"reward_std": 0.29755842685699463,
"rewards/accuracy_reward": 0.109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.31640625,
"rewards/mean_confidence_reward": 0.0,
"step": 3
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3066.0,
"completions/max_terminated_length": 3066.0,
"completions/mean_length": 562.71484375,
"completions/mean_terminated_length": 571.6468505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.004266666666666667,
"grad_norm": 0.014729198068380356,
"learning_rate": 1e-07,
"loss": 0.011,
"num_tokens": 713462.0,
"reward": 0.23828125,
"reward_std": 0.32681804895401,
"rewards/accuracy_reward": 0.12109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.35546875,
"rewards/mean_confidence_reward": 0.0,
"step": 4
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2867.0,
"completions/max_terminated_length": 2867.0,
"completions/mean_length": 545.734375,
"completions/mean_terminated_length": 554.3968505859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.005333333333333333,
"grad_norm": 0.011580186896026134,
"learning_rate": 1.25e-07,
"loss": 0.008,
"num_tokens": 896370.0,
"reward": 0.236328125,
"reward_std": 0.32028326392173767,
"rewards/accuracy_reward": 0.1171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.35546875,
"rewards/mean_confidence_reward": 0.0,
"step": 5
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2675.0,
"completions/max_terminated_length": 2675.0,
"completions/mean_length": 500.78125,
"completions/mean_terminated_length": 506.7193908691406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0064,
"grad_norm": 0.013362888246774673,
"learning_rate": 1.5e-07,
"loss": -0.0003,
"num_tokens": 1067034.0,
"reward": 0.2734375,
"reward_std": 0.3206136226654053,
"rewards/accuracy_reward": 0.14453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.40234375,
"rewards/mean_confidence_reward": 0.0,
"step": 6
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2821.0,
"completions/max_terminated_length": 2821.0,
"completions/mean_length": 575.6328125,
"completions/mean_terminated_length": 589.447998046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.007466666666666667,
"grad_norm": 0.016448985785245895,
"learning_rate": 1.75e-07,
"loss": 0.0048,
"num_tokens": 1258332.0,
"reward": 0.201171875,
"reward_std": 0.2759512960910797,
"rewards/accuracy_reward": 0.0859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.31640625,
"rewards/mean_confidence_reward": 0.0,
"step": 7
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2718.0,
"completions/max_terminated_length": 2718.0,
"completions/mean_length": 513.59375,
"completions/mean_terminated_length": 517.6378173828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.008533333333333334,
"grad_norm": 0.016916830092668533,
"learning_rate": 2e-07,
"loss": -0.0015,
"num_tokens": 1432836.0,
"reward": 0.23828125,
"reward_std": 0.3029829263687134,
"rewards/accuracy_reward": 0.11328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.36328125,
"rewards/mean_confidence_reward": 0.0,
"step": 8
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2462.0,
"completions/max_terminated_length": 2462.0,
"completions/mean_length": 507.5859375,
"completions/mean_terminated_length": 511.5826721191406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0096,
"grad_norm": 0.011906522326171398,
"learning_rate": 2.25e-07,
"loss": 0.0006,
"num_tokens": 1606826.0,
"reward": 0.232421875,
"reward_std": 0.31943199038505554,
"rewards/accuracy_reward": 0.1171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.34765625,
"rewards/mean_confidence_reward": 0.0,
"step": 9
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2551.0,
"completions/max_terminated_length": 2551.0,
"completions/mean_length": 565.359375,
"completions/mean_terminated_length": 574.3333740234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.010666666666666666,
"grad_norm": 0.014779274351894855,
"learning_rate": 2.5e-07,
"loss": -0.0072,
"num_tokens": 1794870.0,
"reward": 0.240234375,
"reward_std": 0.30318766832351685,
"rewards/accuracy_reward": 0.109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.37109375,
"rewards/mean_confidence_reward": 0.0,
"step": 10
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2968.0,
"completions/max_terminated_length": 2968.0,
"completions/mean_length": 594.7421875,
"completions/mean_terminated_length": 609.0160522460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.011733333333333333,
"grad_norm": 0.011167983524501324,
"learning_rate": 2.75e-07,
"loss": 0.0034,
"num_tokens": 1988116.0,
"reward": 0.26953125,
"reward_std": 0.3079420030117035,
"rewards/accuracy_reward": 0.12109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.41796875,
"rewards/mean_confidence_reward": 0.0,
"step": 11
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2587.0,
"completions/max_terminated_length": 2587.0,
"completions/mean_length": 500.71875,
"completions/mean_terminated_length": 510.6932373046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.0128,
"grad_norm": 0.01520876307040453,
"learning_rate": 3e-07,
"loss": 0.0096,
"num_tokens": 2156988.0,
"reward": 0.28125,
"reward_std": 0.33993446826934814,
"rewards/accuracy_reward": 0.16796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.39453125,
"rewards/mean_confidence_reward": 0.0,
"step": 12
},
{
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2975.0,
"completions/max_terminated_length": 2975.0,
"completions/mean_length": 490.8828125,
"completions/mean_terminated_length": 498.67462158203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.013866666666666666,
"grad_norm": 0.018569672480225563,
"learning_rate": 3.25e-07,
"loss": -0.0111,
"num_tokens": 2323758.0,
"reward": 0.3125,
"reward_std": 0.36210939288139343,
"rewards/accuracy_reward": 0.16796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.45703125,
"rewards/mean_confidence_reward": 0.0,
"step": 13
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2852.0,
"completions/max_terminated_length": 2852.0,
"completions/mean_length": 542.48046875,
"completions/mean_terminated_length": 548.9130859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.014933333333333333,
"grad_norm": 0.019627684727311134,
"learning_rate": 3.5e-07,
"loss": 0.0018,
"num_tokens": 2504545.0,
"reward": 0.2890625,
"reward_std": 0.34555116295814514,
"rewards/accuracy_reward": 0.1640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.4140625,
"rewards/mean_confidence_reward": 0.0,
"step": 14
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2786.0,
"completions/max_terminated_length": 2786.0,
"completions/mean_length": 466.3203125,
"completions/mean_terminated_length": 475.6095886230469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.016,
"grad_norm": 0.017364244908094406,
"learning_rate": 3.75e-07,
"loss": -0.007,
"num_tokens": 2668315.0,
"reward": 0.279296875,
"reward_std": 0.3283451199531555,
"rewards/accuracy_reward": 0.10546875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.453125,
"rewards/mean_confidence_reward": 0.0,
"step": 15
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2836.0,
"completions/max_terminated_length": 2836.0,
"completions/mean_length": 585.20703125,
"completions/mean_terminated_length": 594.49609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.017066666666666667,
"grad_norm": 0.025343673303723335,
"learning_rate": 4e-07,
"loss": -0.0114,
"num_tokens": 2863488.0,
"reward": 0.26171875,
"reward_std": 0.3317730128765106,
"rewards/accuracy_reward": 0.13671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.38671875,
"rewards/mean_confidence_reward": 0.0,
"step": 16
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2510.0,
"completions/max_terminated_length": 2510.0,
"completions/mean_length": 539.87109375,
"completions/mean_terminated_length": 550.62548828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.018133333333333335,
"grad_norm": 0.10374850779771805,
"learning_rate": 4.2499999999999995e-07,
"loss": 0.028,
"num_tokens": 3041735.0,
"reward": 0.2578125,
"reward_std": 0.31055694818496704,
"rewards/accuracy_reward": 0.13671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.37890625,
"rewards/mean_confidence_reward": 0.0,
"step": 17
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2617.0,
"completions/max_terminated_length": 2617.0,
"completions/mean_length": 496.09765625,
"completions/mean_terminated_length": 505.9801025390625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0192,
"grad_norm": 0.015069372020661831,
"learning_rate": 4.5e-07,
"loss": -0.0015,
"num_tokens": 3215968.0,
"reward": 0.212890625,
"reward_std": 0.2631013095378876,
"rewards/accuracy_reward": 0.0625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.36328125,
"rewards/mean_confidence_reward": 0.0,
"step": 18
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2801.0,
"completions/max_terminated_length": 2801.0,
"completions/mean_length": 510.16796875,
"completions/mean_terminated_length": 512.1686401367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.020266666666666665,
"grad_norm": 0.042982570827007294,
"learning_rate": 4.7499999999999995e-07,
"loss": 0.0026,
"num_tokens": 3387843.0,
"reward": 0.31640625,
"reward_std": 0.3324541449546814,
"rewards/accuracy_reward": 0.13671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.49609375,
"rewards/mean_confidence_reward": 0.0,
"step": 19
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2615.0,
"completions/max_terminated_length": 2615.0,
"completions/mean_length": 445.1328125,
"completions/mean_terminated_length": 455.8160095214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.021333333333333333,
"grad_norm": 0.029637709259986877,
"learning_rate": 5e-07,
"loss": 0.0207,
"num_tokens": 3543181.0,
"reward": 0.365234375,
"reward_std": 0.32044434547424316,
"rewards/accuracy_reward": 0.1796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.55078125,
"rewards/mean_confidence_reward": 0.0,
"step": 20
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2937.0,
"completions/max_terminated_length": 2937.0,
"completions/mean_length": 491.52734375,
"completions/mean_terminated_length": 493.4549255371094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.0224,
"grad_norm": 0.03345920518040657,
"learning_rate": 5.25e-07,
"loss": 0.0163,
"num_tokens": 3708484.0,
"reward": 0.373046875,
"reward_std": 0.34012746810913086,
"rewards/accuracy_reward": 0.19921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.546875,
"rewards/mean_confidence_reward": 0.0,
"step": 21
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2832.0,
"completions/max_terminated_length": 2832.0,
"completions/mean_length": 415.35546875,
"completions/mean_terminated_length": 418.6259765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.023466666666666667,
"grad_norm": 0.016323313117027283,
"learning_rate": 5.5e-07,
"loss": 0.0116,
"num_tokens": 3853143.0,
"reward": 0.40625,
"reward_std": 0.33967095613479614,
"rewards/accuracy_reward": 0.18359375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.62890625,
"rewards/mean_confidence_reward": 0.0,
"step": 22
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2206.0,
"completions/max_terminated_length": 2206.0,
"completions/mean_length": 428.2421875,
"completions/mean_terminated_length": 431.6141662597656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.024533333333333334,
"grad_norm": 0.017756735906004906,
"learning_rate": 5.749999999999999e-07,
"loss": -0.0044,
"num_tokens": 4003221.0,
"reward": 0.38671875,
"reward_std": 0.325067937374115,
"rewards/accuracy_reward": 0.1796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.59375,
"rewards/mean_confidence_reward": 0.0,
"step": 23
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2367.0,
"completions/max_terminated_length": 2367.0,
"completions/mean_length": 380.55078125,
"completions/mean_terminated_length": 386.5912780761719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0256,
"grad_norm": 0.01651872880756855,
"learning_rate": 6e-07,
"loss": 0.0003,
"num_tokens": 4141666.0,
"reward": 0.462890625,
"reward_std": 0.2878950238227844,
"rewards/accuracy_reward": 0.1875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.73828125,
"rewards/mean_confidence_reward": 0.0,
"step": 24
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2493.0,
"completions/max_terminated_length": 2493.0,
"completions/mean_length": 406.40234375,
"completions/mean_terminated_length": 409.60235595703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.02666666666666667,
"grad_norm": 0.011909977532923222,
"learning_rate": 6.249999999999999e-07,
"loss": 0.0109,
"num_tokens": 4285441.0,
"reward": 0.44921875,
"reward_std": 0.33901095390319824,
"rewards/accuracy_reward": 0.20703125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.69140625,
"rewards/mean_confidence_reward": 0.0,
"step": 25
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2847.0,
"completions/max_terminated_length": 2847.0,
"completions/mean_length": 373.5234375,
"completions/mean_terminated_length": 379.452392578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.027733333333333332,
"grad_norm": 0.012188470922410488,
"learning_rate": 6.5e-07,
"loss": -0.0005,
"num_tokens": 4422815.0,
"reward": 0.44140625,
"reward_std": 0.31855690479278564,
"rewards/accuracy_reward": 0.16796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.71484375,
"rewards/mean_confidence_reward": 0.0,
"step": 26
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2034.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 385.44921875,
"completions/mean_terminated_length": 388.4842529296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0288,
"grad_norm": 0.013420060276985168,
"learning_rate": 6.75e-07,
"loss": -0.0008,
"num_tokens": 4563218.0,
"reward": 0.466796875,
"reward_std": 0.2929942011833191,
"rewards/accuracy_reward": 0.171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.76171875,
"rewards/mean_confidence_reward": 0.0,
"step": 27
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2504.0,
"completions/max_terminated_length": 2504.0,
"completions/mean_length": 397.69140625,
"completions/mean_terminated_length": 399.2510070800781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.029866666666666666,
"grad_norm": 0.008218493312597275,
"learning_rate": 7e-07,
"loss": 0.0083,
"num_tokens": 4708483.0,
"reward": 0.517578125,
"reward_std": 0.29791176319122314,
"rewards/accuracy_reward": 0.25,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.78515625,
"rewards/mean_confidence_reward": 0.0,
"step": 28
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2354.0,
"completions/max_terminated_length": 2354.0,
"completions/mean_length": 399.60546875,
"completions/mean_terminated_length": 402.751953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.030933333333333334,
"grad_norm": 0.008327974937856197,
"learning_rate": 7.249999999999999e-07,
"loss": -0.0067,
"num_tokens": 4854422.0,
"reward": 0.521484375,
"reward_std": 0.284969687461853,
"rewards/accuracy_reward": 0.22265625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.8203125,
"rewards/mean_confidence_reward": 0.0,
"step": 29
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2147.0,
"completions/max_terminated_length": 2147.0,
"completions/mean_length": 391.87890625,
"completions/mean_terminated_length": 396.52569580078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.032,
"grad_norm": 0.012601979076862335,
"learning_rate": 7.5e-07,
"loss": 0.005,
"num_tokens": 4998239.0,
"reward": 0.52734375,
"reward_std": 0.2591574192047119,
"rewards/accuracy_reward": 0.20703125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.84765625,
"rewards/mean_confidence_reward": 0.0,
"step": 30
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1965.0,
"completions/max_terminated_length": 1965.0,
"completions/mean_length": 351.66015625,
"completions/mean_terminated_length": 353.03924560546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.03306666666666667,
"grad_norm": 0.006865192670375109,
"learning_rate": 7.75e-07,
"loss": 0.0002,
"num_tokens": 5130688.0,
"reward": 0.529296875,
"reward_std": 0.20012958347797394,
"rewards/accuracy_reward": 0.15625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.90234375,
"rewards/mean_confidence_reward": 0.0,
"step": 31
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1345.0,
"completions/max_terminated_length": 1345.0,
"completions/mean_length": 363.34765625,
"completions/mean_terminated_length": 363.34765625,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.034133333333333335,
"grad_norm": 0.00856697279959917,
"learning_rate": 8e-07,
"loss": 0.0025,
"num_tokens": 5266921.0,
"reward": 0.552734375,
"reward_std": 0.2204914540052414,
"rewards/accuracy_reward": 0.2109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.89453125,
"rewards/mean_confidence_reward": 0.0,
"step": 32
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1296.0,
"completions/max_terminated_length": 1296.0,
"completions/mean_length": 347.5234375,
"completions/mean_terminated_length": 347.5234375,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.0352,
"grad_norm": 0.013602660968899727,
"learning_rate": 8.249999999999999e-07,
"loss": 0.0053,
"num_tokens": 5399271.0,
"reward": 0.5625,
"reward_std": 0.2572905421257019,
"rewards/accuracy_reward": 0.234375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.890625,
"rewards/mean_confidence_reward": 0.0,
"step": 33
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1293.0,
"completions/max_terminated_length": 1293.0,
"completions/mean_length": 326.44140625,
"completions/mean_terminated_length": 326.44140625,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.03626666666666667,
"grad_norm": 0.007888741791248322,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0093,
"num_tokens": 5524464.0,
"reward": 0.607421875,
"reward_std": 0.21399301290512085,
"rewards/accuracy_reward": 0.265625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.94921875,
"rewards/mean_confidence_reward": 0.0,
"step": 34
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1700.0,
"completions/max_terminated_length": 1700.0,
"completions/mean_length": 397.078125,
"completions/mean_terminated_length": 400.2047119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.037333333333333336,
"grad_norm": 0.006134778261184692,
"learning_rate": 8.75e-07,
"loss": -0.0065,
"num_tokens": 5671884.0,
"reward": 0.537109375,
"reward_std": 0.24372586607933044,
"rewards/accuracy_reward": 0.1796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.89453125,
"rewards/mean_confidence_reward": 0.0,
"step": 35
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1423.0,
"completions/max_terminated_length": 1423.0,
"completions/mean_length": 308.76953125,
"completions/mean_terminated_length": 308.76953125,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.0384,
"grad_norm": 0.007697598543018103,
"learning_rate": 9e-07,
"loss": -0.0068,
"num_tokens": 5790153.0,
"reward": 0.634765625,
"reward_std": 0.21994616091251373,
"rewards/accuracy_reward": 0.3046875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.96484375,
"rewards/mean_confidence_reward": 0.0,
"step": 36
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1411.0,
"completions/max_terminated_length": 1411.0,
"completions/mean_length": 349.4765625,
"completions/mean_terminated_length": 349.4765625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.039466666666666664,
"grad_norm": 0.0046683200635015965,
"learning_rate": 9.25e-07,
"loss": -0.0017,
"num_tokens": 5923227.0,
"reward": 0.609375,
"reward_std": 0.17863737046718597,
"rewards/accuracy_reward": 0.24609375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.97265625,
"rewards/mean_confidence_reward": 0.0,
"step": 37
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1770.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 384.5546875,
"completions/mean_terminated_length": 384.5546875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.04053333333333333,
"grad_norm": 0.006614873185753822,
"learning_rate": 9.499999999999999e-07,
"loss": -0.0055,
"num_tokens": 6065073.0,
"reward": 0.6328125,
"reward_std": 0.21142417192459106,
"rewards/accuracy_reward": 0.30859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.95703125,
"rewards/mean_confidence_reward": 0.0,
"step": 38
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1654.0,
"completions/max_terminated_length": 1654.0,
"completions/mean_length": 378.08984375,
"completions/mean_terminated_length": 378.08984375,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.0416,
"grad_norm": 0.007972519844770432,
"learning_rate": 9.75e-07,
"loss": -0.002,
"num_tokens": 6204464.0,
"reward": 0.603515625,
"reward_std": 0.19587206840515137,
"rewards/accuracy_reward": 0.2421875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.96484375,
"rewards/mean_confidence_reward": 0.0,
"step": 39
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2172.0,
"completions/max_terminated_length": 2172.0,
"completions/mean_length": 388.375,
"completions/mean_terminated_length": 391.4330749511719,
"completions/min_length": 0.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.042666666666666665,
"grad_norm": 0.029957441613078117,
"learning_rate": 1e-06,
"loss": 0.0017,
"num_tokens": 6347160.0,
"reward": 0.603515625,
"reward_std": 0.21501165628433228,
"rewards/accuracy_reward": 0.23828125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.96875,
"rewards/mean_confidence_reward": 0.0,
"step": 40
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1317.0,
"completions/max_terminated_length": 1317.0,
"completions/mean_length": 358.16015625,
"completions/mean_terminated_length": 358.16015625,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.04373333333333333,
"grad_norm": 0.008916471153497696,
"learning_rate": 9.9375e-07,
"loss": -0.0008,
"num_tokens": 6482609.0,
"reward": 0.720703125,
"reward_std": 0.2066127061843872,
"rewards/accuracy_reward": 0.453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 41
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2065.0,
"completions/max_terminated_length": 2065.0,
"completions/mean_length": 333.79296875,
"completions/mean_terminated_length": 333.79296875,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.0448,
"grad_norm": 0.007723547983914614,
"learning_rate": 9.875e-07,
"loss": -0.0013,
"num_tokens": 6608940.0,
"reward": 0.658203125,
"reward_std": 0.22034893929958344,
"rewards/accuracy_reward": 0.359375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.95703125,
"rewards/mean_confidence_reward": 0.0,
"step": 42
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1617.0,
"completions/max_terminated_length": 1617.0,
"completions/mean_length": 364.55078125,
"completions/mean_terminated_length": 364.55078125,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.04586666666666667,
"grad_norm": 0.004379777703434229,
"learning_rate": 9.8125e-07,
"loss": -0.0028,
"num_tokens": 6744001.0,
"reward": 0.671875,
"reward_std": 0.17719866335391998,
"rewards/accuracy_reward": 0.34765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 43
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1617.0,
"completions/max_terminated_length": 1617.0,
"completions/mean_length": 395.99609375,
"completions/mean_terminated_length": 395.99609375,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.046933333333333334,
"grad_norm": 0.0048252190463244915,
"learning_rate": 9.75e-07,
"loss": -0.0024,
"num_tokens": 6888208.0,
"reward": 0.6328125,
"reward_std": 0.1780165135860443,
"rewards/accuracy_reward": 0.28125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 44
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1623.0,
"completions/max_terminated_length": 1623.0,
"completions/mean_length": 372.44140625,
"completions/mean_terminated_length": 372.44140625,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.048,
"grad_norm": 0.0062431166879832745,
"learning_rate": 9.6875e-07,
"loss": -0.0002,
"num_tokens": 7025113.0,
"reward": 0.681640625,
"reward_std": 0.20360445976257324,
"rewards/accuracy_reward": 0.375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 45
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2817.0,
"completions/max_terminated_length": 2817.0,
"completions/mean_length": 394.5625,
"completions/mean_terminated_length": 394.5625,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.04906666666666667,
"grad_norm": 0.0062705399468541145,
"learning_rate": 9.624999999999999e-07,
"loss": -0.0019,
"num_tokens": 7167401.0,
"reward": 0.626953125,
"reward_std": 0.19731590151786804,
"rewards/accuracy_reward": 0.296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.95703125,
"rewards/mean_confidence_reward": 0.0,
"step": 46
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2313.0,
"completions/max_terminated_length": 2313.0,
"completions/mean_length": 420.171875,
"completions/mean_terminated_length": 421.81964111328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.050133333333333335,
"grad_norm": 0.0044891973957419395,
"learning_rate": 9.5625e-07,
"loss": -0.0008,
"num_tokens": 7317453.0,
"reward": 0.634765625,
"reward_std": 0.1924583911895752,
"rewards/accuracy_reward": 0.28515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 47
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2543.0,
"completions/max_terminated_length": 2543.0,
"completions/mean_length": 386.734375,
"completions/mean_terminated_length": 388.2510070800781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.0512,
"grad_norm": 0.0046876417472958565,
"learning_rate": 9.499999999999999e-07,
"loss": -0.0054,
"num_tokens": 7456657.0,
"reward": 0.650390625,
"reward_std": 0.16893011331558228,
"rewards/accuracy_reward": 0.30859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 48
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1753.0,
"completions/max_terminated_length": 1753.0,
"completions/mean_length": 392.78125,
"completions/mean_terminated_length": 394.32159423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.05226666666666667,
"grad_norm": 0.006282743997871876,
"learning_rate": 9.4375e-07,
"loss": -0.0065,
"num_tokens": 7598257.0,
"reward": 0.712890625,
"reward_std": 0.18740198016166687,
"rewards/accuracy_reward": 0.4296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 49
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1716.0,
"completions/max_terminated_length": 1716.0,
"completions/mean_length": 422.3046875,
"completions/mean_terminated_length": 422.3046875,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.05333333333333334,
"grad_norm": 0.00486365519464016,
"learning_rate": 9.374999999999999e-07,
"loss": 0.0013,
"num_tokens": 7748239.0,
"reward": 0.701171875,
"reward_std": 0.19260293245315552,
"rewards/accuracy_reward": 0.40625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 50
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1298.0,
"completions/max_terminated_length": 1298.0,
"completions/mean_length": 421.09375,
"completions/mean_terminated_length": 421.09375,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.0544,
"grad_norm": 0.0049727000296115875,
"learning_rate": 9.3125e-07,
"loss": 0.0009,
"num_tokens": 7901847.0,
"reward": 0.7109375,
"reward_std": 0.1865353286266327,
"rewards/accuracy_reward": 0.42578125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 51
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2057.0,
"completions/max_terminated_length": 2057.0,
"completions/mean_length": 438.16015625,
"completions/mean_terminated_length": 439.8784484863281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.055466666666666664,
"grad_norm": 0.004901991691440344,
"learning_rate": 9.25e-07,
"loss": -0.001,
"num_tokens": 8058480.0,
"reward": 0.755859375,
"reward_std": 0.21218523383140564,
"rewards/accuracy_reward": 0.5234375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 52
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1462.0,
"completions/max_terminated_length": 1462.0,
"completions/mean_length": 451.69921875,
"completions/mean_terminated_length": 451.69921875,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.05653333333333333,
"grad_norm": 0.004037321545183659,
"learning_rate": 9.187499999999999e-07,
"loss": -0.0004,
"num_tokens": 8216451.0,
"reward": 0.73046875,
"reward_std": 0.17419041693210602,
"rewards/accuracy_reward": 0.4609375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 53
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1951.0,
"completions/max_terminated_length": 1951.0,
"completions/mean_length": 393.94140625,
"completions/mean_terminated_length": 393.94140625,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.0576,
"grad_norm": 0.0061675142496824265,
"learning_rate": 9.124999999999999e-07,
"loss": -0.0002,
"num_tokens": 8360044.0,
"reward": 0.783203125,
"reward_std": 0.20793494582176208,
"rewards/accuracy_reward": 0.57421875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 54
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1733.0,
"completions/max_terminated_length": 1733.0,
"completions/mean_length": 409.58203125,
"completions/mean_terminated_length": 411.1882629394531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.058666666666666666,
"grad_norm": 0.004997917916625738,
"learning_rate": 9.0625e-07,
"loss": -0.0133,
"num_tokens": 8509233.0,
"reward": 0.71484375,
"reward_std": 0.1894191950559616,
"rewards/accuracy_reward": 0.4375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 55
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2430.0,
"completions/max_terminated_length": 2430.0,
"completions/mean_length": 465.42578125,
"completions/mean_terminated_length": 467.2510070800781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.05973333333333333,
"grad_norm": 0.0075267828069627285,
"learning_rate": 9e-07,
"loss": -0.0018,
"num_tokens": 8671734.0,
"reward": 0.71484375,
"reward_std": 0.17486415803432465,
"rewards/accuracy_reward": 0.4375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 56
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2337.0,
"completions/max_terminated_length": 2337.0,
"completions/mean_length": 428.4609375,
"completions/mean_terminated_length": 431.83465576171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.0608,
"grad_norm": 0.004536744672805071,
"learning_rate": 8.9375e-07,
"loss": -0.0068,
"num_tokens": 8824724.0,
"reward": 0.767578125,
"reward_std": 0.18233522772789001,
"rewards/accuracy_reward": 0.54296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 57
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2200.0,
"completions/max_terminated_length": 2200.0,
"completions/mean_length": 515.69921875,
"completions/mean_terminated_length": 515.69921875,
"completions/min_length": 96.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.06186666666666667,
"grad_norm": 0.004378114826977253,
"learning_rate": 8.874999999999999e-07,
"loss": -0.0063,
"num_tokens": 8999575.0,
"reward": 0.685546875,
"reward_std": 0.18510794639587402,
"rewards/accuracy_reward": 0.375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 58
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1520.0,
"completions/max_terminated_length": 1520.0,
"completions/mean_length": 458.90234375,
"completions/mean_terminated_length": 458.90234375,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.06293333333333333,
"grad_norm": 0.0051321005448699,
"learning_rate": 8.812499999999999e-07,
"loss": 0.008,
"num_tokens": 9159814.0,
"reward": 0.72265625,
"reward_std": 0.16925148665905,
"rewards/accuracy_reward": 0.4453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 59
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2070.0,
"completions/max_terminated_length": 2070.0,
"completions/mean_length": 482.69921875,
"completions/mean_terminated_length": 482.69921875,
"completions/min_length": 108.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.064,
"grad_norm": 0.003565035294741392,
"learning_rate": 8.75e-07,
"loss": 0.0002,
"num_tokens": 9328753.0,
"reward": 0.70703125,
"reward_std": 0.16893331706523895,
"rewards/accuracy_reward": 0.42578125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 60
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1562.0,
"completions/max_terminated_length": 1562.0,
"completions/mean_length": 422.16015625,
"completions/mean_terminated_length": 422.16015625,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.06506666666666666,
"grad_norm": 0.004943581763654947,
"learning_rate": 8.687499999999999e-07,
"loss": 0.0007,
"num_tokens": 9477402.0,
"reward": 0.794921875,
"reward_std": 0.13170689344406128,
"rewards/accuracy_reward": 0.58984375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 61
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2736.0,
"completions/max_terminated_length": 2736.0,
"completions/mean_length": 489.13671875,
"completions/mean_terminated_length": 489.13671875,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.06613333333333334,
"grad_norm": 0.005266538821160793,
"learning_rate": 8.625e-07,
"loss": -0.003,
"num_tokens": 9646213.0,
"reward": 0.763671875,
"reward_std": 0.1926351934671402,
"rewards/accuracy_reward": 0.53515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 62
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1716.0,
"completions/max_terminated_length": 1716.0,
"completions/mean_length": 522.9609375,
"completions/mean_terminated_length": 522.9609375,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.0672,
"grad_norm": 0.005230387672781944,
"learning_rate": 8.5625e-07,
"loss": 0.0009,
"num_tokens": 9825243.0,
"reward": 0.7421875,
"reward_std": 0.18372184038162231,
"rewards/accuracy_reward": 0.4921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 63
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2251.0,
"completions/max_terminated_length": 2251.0,
"completions/mean_length": 477.1953125,
"completions/mean_terminated_length": 479.0666809082031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.06826666666666667,
"grad_norm": 0.0031063214410096407,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0031,
"num_tokens": 9987693.0,
"reward": 0.765625,
"reward_std": 0.15420205891132355,
"rewards/accuracy_reward": 0.5390625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 64
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1033.0,
"completions/max_terminated_length": 1033.0,
"completions/mean_length": 409.75,
"completions/mean_terminated_length": 411.35687255859375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.06933333333333333,
"grad_norm": 0.0037314582150429487,
"learning_rate": 8.4375e-07,
"loss": -0.0005,
"num_tokens": 10134125.0,
"reward": 0.74609375,
"reward_std": 0.1410868614912033,
"rewards/accuracy_reward": 0.50390625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 65
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2992.0,
"completions/max_terminated_length": 2992.0,
"completions/mean_length": 549.0546875,
"completions/mean_terminated_length": 553.3779296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.0704,
"grad_norm": 0.002695879666134715,
"learning_rate": 8.375e-07,
"loss": 0.0005,
"num_tokens": 10317547.0,
"reward": 0.673828125,
"reward_std": 0.1499173790216446,
"rewards/accuracy_reward": 0.3671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98046875,
"rewards/mean_confidence_reward": 0.0,
"step": 66
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2855.0,
"completions/max_terminated_length": 2855.0,
"completions/mean_length": 508.328125,
"completions/mean_terminated_length": 508.328125,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.07146666666666666,
"grad_norm": 0.0026889382861554623,
"learning_rate": 8.3125e-07,
"loss": -0.003,
"num_tokens": 10489199.0,
"reward": 0.79296875,
"reward_std": 0.12855470180511475,
"rewards/accuracy_reward": 0.58984375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 67
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1921.0,
"completions/max_terminated_length": 1921.0,
"completions/mean_length": 483.36328125,
"completions/mean_terminated_length": 483.36328125,
"completions/min_length": 84.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.07253333333333334,
"grad_norm": 0.002774188295006752,
"learning_rate": 8.249999999999999e-07,
"loss": 0.001,
"num_tokens": 10653540.0,
"reward": 0.7734375,
"reward_std": 0.14229989051818848,
"rewards/accuracy_reward": 0.55078125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 68
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2838.0,
"completions/max_terminated_length": 2838.0,
"completions/mean_length": 565.65234375,
"completions/mean_terminated_length": 567.87060546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.0736,
"grad_norm": 0.0030759924557060003,
"learning_rate": 8.187499999999999e-07,
"loss": -0.0008,
"num_tokens": 10839355.0,
"reward": 0.70703125,
"reward_std": 0.17795701324939728,
"rewards/accuracy_reward": 0.42578125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 69
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1614.0,
"completions/max_terminated_length": 1614.0,
"completions/mean_length": 518.40234375,
"completions/mean_terminated_length": 520.435302734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.07466666666666667,
"grad_norm": 0.002635194454342127,
"learning_rate": 8.125e-07,
"loss": -0.0045,
"num_tokens": 11015570.0,
"reward": 0.724609375,
"reward_std": 0.1347435861825943,
"rewards/accuracy_reward": 0.45703125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 70
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2671.0,
"completions/max_terminated_length": 2671.0,
"completions/mean_length": 559.83203125,
"completions/mean_terminated_length": 562.0274658203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.07573333333333333,
"grad_norm": 0.003967128694057465,
"learning_rate": 8.0625e-07,
"loss": -0.0018,
"num_tokens": 11199807.0,
"reward": 0.734375,
"reward_std": 0.19562900066375732,
"rewards/accuracy_reward": 0.48828125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98046875,
"rewards/mean_confidence_reward": 0.0,
"step": 71
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2753.0,
"completions/max_terminated_length": 2753.0,
"completions/mean_length": 481.41796875,
"completions/mean_terminated_length": 483.305908203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.0768,
"grad_norm": 0.0023626715410500765,
"learning_rate": 8e-07,
"loss": -0.0012,
"num_tokens": 11363970.0,
"reward": 0.755859375,
"reward_std": 0.13651162385940552,
"rewards/accuracy_reward": 0.51953125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 72
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2208.0,
"completions/max_terminated_length": 2208.0,
"completions/mean_length": 507.8671875,
"completions/mean_terminated_length": 507.8671875,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.07786666666666667,
"grad_norm": 0.0034668815787881613,
"learning_rate": 7.937499999999999e-07,
"loss": 0.0011,
"num_tokens": 11537528.0,
"reward": 0.83984375,
"reward_std": 0.135064959526062,
"rewards/accuracy_reward": 0.6796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 73
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2592.0,
"completions/max_terminated_length": 2592.0,
"completions/mean_length": 519.9765625,
"completions/mean_terminated_length": 519.9765625,
"completions/min_length": 96.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.07893333333333333,
"grad_norm": 0.0036854897625744343,
"learning_rate": 7.875e-07,
"loss": 0.0001,
"num_tokens": 11711082.0,
"reward": 0.736328125,
"reward_std": 0.17774078249931335,
"rewards/accuracy_reward": 0.484375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 74
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1386.0,
"completions/max_terminated_length": 1386.0,
"completions/mean_length": 456.67578125,
"completions/mean_terminated_length": 456.67578125,
"completions/min_length": 84.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.08,
"grad_norm": 0.0030605688225477934,
"learning_rate": 7.812499999999999e-07,
"loss": -0.0,
"num_tokens": 11869255.0,
"reward": 0.8359375,
"reward_std": 0.1430206447839737,
"rewards/accuracy_reward": 0.671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 75
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1766.0,
"completions/max_terminated_length": 1766.0,
"completions/mean_length": 492.203125,
"completions/mean_terminated_length": 492.203125,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.08106666666666666,
"grad_norm": 0.002868713578209281,
"learning_rate": 7.75e-07,
"loss": 0.0006,
"num_tokens": 12034827.0,
"reward": 0.796875,
"reward_std": 0.13965940475463867,
"rewards/accuracy_reward": 0.59765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 76
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2216.0,
"completions/max_terminated_length": 2216.0,
"completions/mean_length": 490.140625,
"completions/mean_terminated_length": 492.0627746582031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.08213333333333334,
"grad_norm": 0.003421858651563525,
"learning_rate": 7.6875e-07,
"loss": -0.008,
"num_tokens": 12201479.0,
"reward": 0.8125,
"reward_std": 0.18218934535980225,
"rewards/accuracy_reward": 0.62890625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 77
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1699.0,
"completions/max_terminated_length": 1699.0,
"completions/mean_length": 583.75,
"completions/mean_terminated_length": 583.75,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.0832,
"grad_norm": 0.0027811885811388493,
"learning_rate": 7.624999999999999e-07,
"loss": 0.0025,
"num_tokens": 12395455.0,
"reward": 0.751953125,
"reward_std": 0.15598376095294952,
"rewards/accuracy_reward": 0.51171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 78
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2660.0,
"completions/max_terminated_length": 2660.0,
"completions/mean_length": 538.19921875,
"completions/mean_terminated_length": 538.19921875,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.08426666666666667,
"grad_norm": 0.0027420504484325647,
"learning_rate": 7.5625e-07,
"loss": 0.0035,
"num_tokens": 12576122.0,
"reward": 0.779296875,
"reward_std": 0.1334904432296753,
"rewards/accuracy_reward": 0.5625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 79
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1508.0,
"completions/max_terminated_length": 1508.0,
"completions/mean_length": 496.2421875,
"completions/mean_terminated_length": 496.2421875,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.08533333333333333,
"grad_norm": 0.0035391824785619974,
"learning_rate": 7.5e-07,
"loss": 0.0011,
"num_tokens": 12741832.0,
"reward": 0.830078125,
"reward_std": 0.12697824835777283,
"rewards/accuracy_reward": 0.66015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 80
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2600.0,
"completions/max_terminated_length": 2600.0,
"completions/mean_length": 512.80078125,
"completions/mean_terminated_length": 523.0159301757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.0864,
"grad_norm": 0.002712409244850278,
"learning_rate": 7.4375e-07,
"loss": -0.0104,
"num_tokens": 12915869.0,
"reward": 0.810546875,
"reward_std": 0.16697202622890472,
"rewards/accuracy_reward": 0.64453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9765625,
"rewards/mean_confidence_reward": 0.0,
"step": 81
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1993.0,
"completions/max_terminated_length": 1993.0,
"completions/mean_length": 476.73046875,
"completions/mean_terminated_length": 478.60003662109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.08746666666666666,
"grad_norm": 0.002215381944552064,
"learning_rate": 7.375e-07,
"loss": -0.004,
"num_tokens": 13079976.0,
"reward": 0.794921875,
"reward_std": 0.11139336228370667,
"rewards/accuracy_reward": 0.59765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 82
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1611.0,
"completions/max_terminated_length": 1611.0,
"completions/mean_length": 568.62109375,
"completions/mean_terminated_length": 568.62109375,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.08853333333333334,
"grad_norm": 0.0023948366288095713,
"learning_rate": 7.312499999999999e-07,
"loss": 0.0042,
"num_tokens": 13269319.0,
"reward": 0.775390625,
"reward_std": 0.10429459810256958,
"rewards/accuracy_reward": 0.55859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 83
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1361.0,
"completions/max_terminated_length": 1361.0,
"completions/mean_length": 513.49609375,
"completions/mean_terminated_length": 513.49609375,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.0896,
"grad_norm": 0.0028763434384018183,
"learning_rate": 7.249999999999999e-07,
"loss": -0.0007,
"num_tokens": 13443206.0,
"reward": 0.787109375,
"reward_std": 0.15434354543685913,
"rewards/accuracy_reward": 0.58203125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 84
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2768.0,
"completions/max_terminated_length": 2768.0,
"completions/mean_length": 529.04296875,
"completions/mean_terminated_length": 533.2086791992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.09066666666666667,
"grad_norm": 0.002804612973704934,
"learning_rate": 7.1875e-07,
"loss": -0.0041,
"num_tokens": 13622977.0,
"reward": 0.7734375,
"reward_std": 0.1285894215106964,
"rewards/accuracy_reward": 0.5546875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 85
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2205.0,
"completions/max_terminated_length": 2205.0,
"completions/mean_length": 570.74609375,
"completions/mean_terminated_length": 575.2401733398438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.09173333333333333,
"grad_norm": 0.0030598430894315243,
"learning_rate": 7.125e-07,
"loss": 0.001,
"num_tokens": 13811112.0,
"reward": 0.779296875,
"reward_std": 0.13960447907447815,
"rewards/accuracy_reward": 0.5703125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 86
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2625.0,
"completions/max_terminated_length": 2625.0,
"completions/mean_length": 512.4140625,
"completions/mean_terminated_length": 514.423583984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.0928,
"grad_norm": 0.0032272052485495806,
"learning_rate": 7.0625e-07,
"loss": 0.0052,
"num_tokens": 13984298.0,
"reward": 0.822265625,
"reward_std": 0.16597393155097961,
"rewards/accuracy_reward": 0.65234375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 87
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1496.0,
"completions/max_terminated_length": 1496.0,
"completions/mean_length": 523.2734375,
"completions/mean_terminated_length": 523.2734375,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.09386666666666667,
"grad_norm": 0.0028333135414868593,
"learning_rate": 7e-07,
"loss": -0.0006,
"num_tokens": 14164616.0,
"reward": 0.822265625,
"reward_std": 0.12360849976539612,
"rewards/accuracy_reward": 0.6484375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 88
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1793.0,
"completions/max_terminated_length": 1793.0,
"completions/mean_length": 613.7734375,
"completions/mean_terminated_length": 613.7734375,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.09493333333333333,
"grad_norm": 0.002591961296275258,
"learning_rate": 6.937499999999999e-07,
"loss": 0.0039,
"num_tokens": 14367142.0,
"reward": 0.775390625,
"reward_std": 0.13197332620620728,
"rewards/accuracy_reward": 0.5546875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 89
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3071.0,
"completions/max_terminated_length": 3071.0,
"completions/mean_length": 558.484375,
"completions/mean_terminated_length": 558.484375,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.096,
"grad_norm": 0.00329598318785429,
"learning_rate": 6.875e-07,
"loss": 0.0013,
"num_tokens": 14549946.0,
"reward": 0.818359375,
"reward_std": 0.14798864722251892,
"rewards/accuracy_reward": 0.65234375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 90
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2182.0,
"completions/max_terminated_length": 2182.0,
"completions/mean_length": 562.3359375,
"completions/mean_terminated_length": 562.3359375,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.09706666666666666,
"grad_norm": 0.002836856758221984,
"learning_rate": 6.8125e-07,
"loss": 0.0028,
"num_tokens": 14738128.0,
"reward": 0.818359375,
"reward_std": 0.13591869175434113,
"rewards/accuracy_reward": 0.63671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 91
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1236.0,
"completions/max_terminated_length": 1236.0,
"completions/mean_length": 463.96484375,
"completions/mean_terminated_length": 469.4664306640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.09813333333333334,
"grad_norm": 0.002322397194802761,
"learning_rate": 6.75e-07,
"loss": -0.0067,
"num_tokens": 14900135.0,
"reward": 0.8125,
"reward_std": 0.11893363296985626,
"rewards/accuracy_reward": 0.63671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 92
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2141.0,
"completions/max_terminated_length": 2141.0,
"completions/mean_length": 518.546875,
"completions/mean_terminated_length": 518.546875,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.0992,
"grad_norm": 0.0025662637781351805,
"learning_rate": 6.6875e-07,
"loss": -0.0011,
"num_tokens": 15075171.0,
"reward": 0.8046875,
"reward_std": 0.13435333967208862,
"rewards/accuracy_reward": 0.6171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 93
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2876.0,
"completions/max_terminated_length": 2876.0,
"completions/mean_length": 499.546875,
"completions/mean_terminated_length": 501.50592041015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.10026666666666667,
"grad_norm": 0.002923916559666395,
"learning_rate": 6.624999999999999e-07,
"loss": -0.0021,
"num_tokens": 15248247.0,
"reward": 0.796875,
"reward_std": 0.13210630416870117,
"rewards/accuracy_reward": 0.6015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 94
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2623.0,
"completions/max_terminated_length": 2623.0,
"completions/mean_length": 532.91015625,
"completions/mean_terminated_length": 539.229248046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.10133333333333333,
"grad_norm": 0.0024504868779331446,
"learning_rate": 6.5625e-07,
"loss": -0.0055,
"num_tokens": 15427312.0,
"reward": 0.8359375,
"reward_std": 0.14083153009414673,
"rewards/accuracy_reward": 0.6875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 95
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1772.0,
"completions/max_terminated_length": 1772.0,
"completions/mean_length": 486.4453125,
"completions/mean_terminated_length": 492.2134704589844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.1024,
"grad_norm": 0.0025696789380162954,
"learning_rate": 6.5e-07,
"loss": -0.0025,
"num_tokens": 15594170.0,
"reward": 0.84375,
"reward_std": 0.12342788279056549,
"rewards/accuracy_reward": 0.69921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 96
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1982.0,
"completions/max_terminated_length": 1982.0,
"completions/mean_length": 511.07421875,
"completions/mean_terminated_length": 511.07421875,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.10346666666666667,
"grad_norm": 0.0024422351270914078,
"learning_rate": 6.4375e-07,
"loss": -0.0021,
"num_tokens": 15766589.0,
"reward": 0.810546875,
"reward_std": 0.11323504149913788,
"rewards/accuracy_reward": 0.625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 97
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2317.0,
"completions/max_terminated_length": 2317.0,
"completions/mean_length": 542.57421875,
"completions/mean_terminated_length": 544.7019653320312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.10453333333333334,
"grad_norm": 0.0034078473690897226,
"learning_rate": 6.374999999999999e-07,
"loss": -0.0009,
"num_tokens": 15948184.0,
"reward": 0.80078125,
"reward_std": 0.1730714738368988,
"rewards/accuracy_reward": 0.61328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 98
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2605.0,
"completions/max_terminated_length": 2605.0,
"completions/mean_length": 629.62890625,
"completions/mean_terminated_length": 637.0949096679688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.1056,
"grad_norm": 0.0025581337977200747,
"learning_rate": 6.3125e-07,
"loss": -0.0007,
"num_tokens": 16151681.0,
"reward": 0.708984375,
"reward_std": 0.1721416413784027,
"rewards/accuracy_reward": 0.4375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98046875,
"rewards/mean_confidence_reward": 0.0,
"step": 99
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2157.0,
"completions/max_terminated_length": 2157.0,
"completions/mean_length": 568.0859375,
"completions/mean_terminated_length": 570.3137817382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.10666666666666667,
"grad_norm": 0.0018688776763156056,
"learning_rate": 6.249999999999999e-07,
"loss": -0.0053,
"num_tokens": 16341031.0,
"reward": 0.7890625,
"reward_std": 0.0926247388124466,
"rewards/accuracy_reward": 0.5859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 100
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2366.0,
"completions/max_terminated_length": 2366.0,
"completions/mean_length": 577.15234375,
"completions/mean_terminated_length": 579.4157104492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.10773333333333333,
"grad_norm": 0.0025327280163764954,
"learning_rate": 6.1875e-07,
"loss": -0.0029,
"num_tokens": 16532286.0,
"reward": 0.7421875,
"reward_std": 0.1423560380935669,
"rewards/accuracy_reward": 0.48828125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 101
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1679.0,
"completions/max_terminated_length": 1679.0,
"completions/mean_length": 451.53125,
"completions/mean_terminated_length": 455.08660888671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.1088,
"grad_norm": 0.003051973879337311,
"learning_rate": 6.125000000000001e-07,
"loss": -0.0084,
"num_tokens": 16691086.0,
"reward": 0.833984375,
"reward_std": 0.11426430940628052,
"rewards/accuracy_reward": 0.67578125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 102
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1856.0,
"completions/max_terminated_length": 1856.0,
"completions/mean_length": 610.2265625,
"completions/mean_terminated_length": 610.2265625,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.10986666666666667,
"grad_norm": 0.0017390416469424963,
"learning_rate": 6.062499999999999e-07,
"loss": 0.0024,
"num_tokens": 16888368.0,
"reward": 0.791015625,
"reward_std": 0.07429992407560349,
"rewards/accuracy_reward": 0.58203125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 103
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1735.0,
"completions/max_terminated_length": 1735.0,
"completions/mean_length": 563.5859375,
"completions/mean_terminated_length": 563.5859375,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.11093333333333333,
"grad_norm": 0.0028268408495932817,
"learning_rate": 6e-07,
"loss": 0.002,
"num_tokens": 17075838.0,
"reward": 0.755859375,
"reward_std": 0.14039888978004456,
"rewards/accuracy_reward": 0.515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 104
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2530.0,
"completions/max_terminated_length": 2530.0,
"completions/mean_length": 549.6953125,
"completions/mean_terminated_length": 554.0236206054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.112,
"grad_norm": 0.0031766602769494057,
"learning_rate": 5.937499999999999e-07,
"loss": 0.0043,
"num_tokens": 17258832.0,
"reward": 0.791015625,
"reward_std": 0.13753846287727356,
"rewards/accuracy_reward": 0.59765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 105
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1202.0,
"completions/max_terminated_length": 1202.0,
"completions/mean_length": 513.44140625,
"completions/mean_terminated_length": 513.44140625,
"completions/min_length": 203.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.11306666666666666,
"grad_norm": 0.002571301767602563,
"learning_rate": 5.875e-07,
"loss": -0.0011,
"num_tokens": 17431369.0,
"reward": 0.8125,
"reward_std": 0.10934460163116455,
"rewards/accuracy_reward": 0.625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 106
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1475.0,
"completions/max_terminated_length": 1475.0,
"completions/mean_length": 508.31640625,
"completions/mean_terminated_length": 512.3189086914062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.11413333333333334,
"grad_norm": 0.0027525799814611673,
"learning_rate": 5.8125e-07,
"loss": -0.0027,
"num_tokens": 17602626.0,
"reward": 0.830078125,
"reward_std": 0.15401843190193176,
"rewards/accuracy_reward": 0.66796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 107
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1962.0,
"completions/max_terminated_length": 1962.0,
"completions/mean_length": 591.921875,
"completions/mean_terminated_length": 594.2431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.1152,
"grad_norm": 0.0024943354073911905,
"learning_rate": 5.749999999999999e-07,
"loss": -0.0092,
"num_tokens": 17793902.0,
"reward": 0.845703125,
"reward_std": 0.15900494158267975,
"rewards/accuracy_reward": 0.6953125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 108
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3015.0,
"completions/max_terminated_length": 3015.0,
"completions/mean_length": 554.36328125,
"completions/mean_terminated_length": 556.5372924804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.11626666666666667,
"grad_norm": 0.0026299208402633667,
"learning_rate": 5.6875e-07,
"loss": -0.0007,
"num_tokens": 17976931.0,
"reward": 0.740234375,
"reward_std": 0.10120174288749695,
"rewards/accuracy_reward": 0.49609375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 109
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2637.0,
"completions/max_terminated_length": 2637.0,
"completions/mean_length": 503.09765625,
"completions/mean_terminated_length": 503.09765625,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.11733333333333333,
"grad_norm": 0.0026612987276166677,
"learning_rate": 5.625e-07,
"loss": 0.0071,
"num_tokens": 18147156.0,
"reward": 0.79296875,
"reward_std": 0.13546374440193176,
"rewards/accuracy_reward": 0.59765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 110
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2910.0,
"completions/max_terminated_length": 2910.0,
"completions/mean_length": 523.625,
"completions/mean_terminated_length": 529.833984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.1184,
"grad_norm": 0.002410402987152338,
"learning_rate": 5.5625e-07,
"loss": -0.0053,
"num_tokens": 18325124.0,
"reward": 0.798828125,
"reward_std": 0.132163405418396,
"rewards/accuracy_reward": 0.61328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 111
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3054.0,
"completions/max_terminated_length": 3054.0,
"completions/mean_length": 615.296875,
"completions/mean_terminated_length": 615.296875,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.11946666666666667,
"grad_norm": 0.002935344586148858,
"learning_rate": 5.5e-07,
"loss": -0.0008,
"num_tokens": 18527072.0,
"reward": 0.765625,
"reward_std": 0.11862511187791824,
"rewards/accuracy_reward": 0.53515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 112
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2083.0,
"completions/max_terminated_length": 2083.0,
"completions/mean_length": 466.8203125,
"completions/mean_terminated_length": 468.6510009765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.12053333333333334,
"grad_norm": 0.002842753892764449,
"learning_rate": 5.4375e-07,
"loss": 0.0046,
"num_tokens": 18688290.0,
"reward": 0.765625,
"reward_std": 0.12914568185806274,
"rewards/accuracy_reward": 0.53515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 113
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2560.0,
"completions/max_terminated_length": 2560.0,
"completions/mean_length": 488.6015625,
"completions/mean_terminated_length": 492.4488220214844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.1216,
"grad_norm": 0.002613338641822338,
"learning_rate": 5.374999999999999e-07,
"loss": -0.0027,
"num_tokens": 18854908.0,
"reward": 0.8046875,
"reward_std": 0.10021381080150604,
"rewards/accuracy_reward": 0.62109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 114
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1612.0,
"completions/max_terminated_length": 1612.0,
"completions/mean_length": 500.20703125,
"completions/mean_terminated_length": 502.1686706542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.12266666666666666,
"grad_norm": 0.0021633750293403864,
"learning_rate": 5.3125e-07,
"loss": -0.0012,
"num_tokens": 19024737.0,
"reward": 0.798828125,
"reward_std": 0.1338150054216385,
"rewards/accuracy_reward": 0.6015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 115
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1410.0,
"completions/max_terminated_length": 1410.0,
"completions/mean_length": 557.59765625,
"completions/mean_terminated_length": 557.59765625,
"completions/min_length": 84.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.12373333333333333,
"grad_norm": 0.002700052922591567,
"learning_rate": 5.25e-07,
"loss": -0.0001,
"num_tokens": 19208514.0,
"reward": 0.8125,
"reward_std": 0.15344911813735962,
"rewards/accuracy_reward": 0.62890625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 116
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1556.0,
"completions/max_terminated_length": 1556.0,
"completions/mean_length": 526.42578125,
"completions/mean_terminated_length": 526.42578125,
"completions/min_length": 123.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.1248,
"grad_norm": 0.0025196054484695196,
"learning_rate": 5.1875e-07,
"loss": 0.0044,
"num_tokens": 19386391.0,
"reward": 0.75390625,
"reward_std": 0.14459392428398132,
"rewards/accuracy_reward": 0.5078125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 117
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2774.0,
"completions/max_terminated_length": 2774.0,
"completions/mean_length": 566.13671875,
"completions/mean_terminated_length": 570.594482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.12586666666666665,
"grad_norm": 0.00227278470993042,
"learning_rate": 5.125e-07,
"loss": -0.0063,
"num_tokens": 19571842.0,
"reward": 0.806640625,
"reward_std": 0.11597176641225815,
"rewards/accuracy_reward": 0.62109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 118
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2020.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 550.25,
"completions/mean_terminated_length": 554.5827026367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.12693333333333334,
"grad_norm": 0.0028481134213507175,
"learning_rate": 5.062499999999999e-07,
"loss": -0.0069,
"num_tokens": 19754282.0,
"reward": 0.78125,
"reward_std": 0.11709947139024734,
"rewards/accuracy_reward": 0.5703125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 119
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1418.0,
"completions/max_terminated_length": 1418.0,
"completions/mean_length": 506.68359375,
"completions/mean_terminated_length": 508.6706237792969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.128,
"grad_norm": 0.002370421774685383,
"learning_rate": 5e-07,
"loss": -0.0013,
"num_tokens": 19927193.0,
"reward": 0.83984375,
"reward_std": 0.09850388765335083,
"rewards/accuracy_reward": 0.68359375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 120
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2703.0,
"completions/max_terminated_length": 2703.0,
"completions/mean_length": 610.82421875,
"completions/mean_terminated_length": 610.82421875,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.12906666666666666,
"grad_norm": 0.0025763448793441057,
"learning_rate": 4.9375e-07,
"loss": -0.0004,
"num_tokens": 20125132.0,
"reward": 0.83203125,
"reward_std": 0.1545259803533554,
"rewards/accuracy_reward": 0.671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 121
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2990.0,
"completions/max_terminated_length": 2990.0,
"completions/mean_length": 539.0859375,
"completions/mean_terminated_length": 541.2000122070312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.13013333333333332,
"grad_norm": 0.002326871268451214,
"learning_rate": 4.875e-07,
"loss": -0.003,
"num_tokens": 20306994.0,
"reward": 0.80859375,
"reward_std": 0.10887734591960907,
"rewards/accuracy_reward": 0.625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 122
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1986.0,
"completions/max_terminated_length": 1986.0,
"completions/mean_length": 593.75,
"completions/mean_terminated_length": 600.79052734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.1312,
"grad_norm": 0.0019709800835698843,
"learning_rate": 4.812499999999999e-07,
"loss": -0.0078,
"num_tokens": 20500794.0,
"reward": 0.81640625,
"reward_std": 0.11907092481851578,
"rewards/accuracy_reward": 0.64453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 123
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2476.0,
"completions/max_terminated_length": 2476.0,
"completions/mean_length": 590.7890625,
"completions/mean_terminated_length": 593.1058959960938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.13226666666666667,
"grad_norm": 0.0023778609465807676,
"learning_rate": 4.7499999999999995e-07,
"loss": -0.0015,
"num_tokens": 20695364.0,
"reward": 0.818359375,
"reward_std": 0.13847261667251587,
"rewards/accuracy_reward": 0.64453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 124
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2717.0,
"completions/max_terminated_length": 2717.0,
"completions/mean_length": 595.015625,
"completions/mean_terminated_length": 595.015625,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.13333333333333333,
"grad_norm": 0.002285284223034978,
"learning_rate": 4.6874999999999996e-07,
"loss": 0.0015,
"num_tokens": 20889008.0,
"reward": 0.7890625,
"reward_std": 0.15789635479450226,
"rewards/accuracy_reward": 0.58203125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 125
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2110.0,
"completions/max_terminated_length": 2110.0,
"completions/mean_length": 547.8515625,
"completions/mean_terminated_length": 547.8515625,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.1344,
"grad_norm": 0.002394335111603141,
"learning_rate": 4.625e-07,
"loss": 0.0031,
"num_tokens": 21071234.0,
"reward": 0.767578125,
"reward_std": 0.12618385255336761,
"rewards/accuracy_reward": 0.53515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 126
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2732.0,
"completions/max_terminated_length": 2732.0,
"completions/mean_length": 534.67578125,
"completions/mean_terminated_length": 547.5079956054688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.13546666666666668,
"grad_norm": 0.0030156190041452646,
"learning_rate": 4.5624999999999997e-07,
"loss": -0.0119,
"num_tokens": 21248295.0,
"reward": 0.763671875,
"reward_std": 0.16096064448356628,
"rewards/accuracy_reward": 0.5546875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.97265625,
"rewards/mean_confidence_reward": 0.0,
"step": 127
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2875.0,
"completions/max_terminated_length": 2875.0,
"completions/mean_length": 575.37109375,
"completions/mean_terminated_length": 584.5040283203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.13653333333333334,
"grad_norm": 0.0038420057389885187,
"learning_rate": 4.5e-07,
"loss": -0.0046,
"num_tokens": 21438766.0,
"reward": 0.75390625,
"reward_std": 0.1539350003004074,
"rewards/accuracy_reward": 0.53515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.97265625,
"rewards/mean_confidence_reward": 0.0,
"step": 128
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1697.0,
"completions/max_terminated_length": 1697.0,
"completions/mean_length": 486.66796875,
"completions/mean_terminated_length": 488.5765075683594,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.1376,
"grad_norm": 0.002230524318292737,
"learning_rate": 4.4374999999999993e-07,
"loss": -0.0019,
"num_tokens": 21602249.0,
"reward": 0.8203125,
"reward_std": 0.11866964399814606,
"rewards/accuracy_reward": 0.64453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 129
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2151.0,
"completions/max_terminated_length": 2151.0,
"completions/mean_length": 491.59765625,
"completions/mean_terminated_length": 491.59765625,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.13866666666666666,
"grad_norm": 0.002271746750921011,
"learning_rate": 4.375e-07,
"loss": 0.0023,
"num_tokens": 21769898.0,
"reward": 0.833984375,
"reward_std": 0.07088252902030945,
"rewards/accuracy_reward": 0.66796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 130
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1864.0,
"completions/max_terminated_length": 1864.0,
"completions/mean_length": 522.3203125,
"completions/mean_terminated_length": 528.5138549804688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 207.0,
"epoch": 0.13973333333333332,
"grad_norm": 0.002564500318840146,
"learning_rate": 4.3125e-07,
"loss": -0.0087,
"num_tokens": 21946332.0,
"reward": 0.7109375,
"reward_std": 0.11090625822544098,
"rewards/accuracy_reward": 0.4375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 131
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2636.0,
"completions/max_terminated_length": 2636.0,
"completions/mean_length": 551.3359375,
"completions/mean_terminated_length": 551.3359375,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.1408,
"grad_norm": 0.0024552117101848125,
"learning_rate": 4.2499999999999995e-07,
"loss": 0.0003,
"num_tokens": 22129578.0,
"reward": 0.84765625,
"reward_std": 0.12388662993907928,
"rewards/accuracy_reward": 0.69921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 132
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2833.0,
"completions/max_terminated_length": 2833.0,
"completions/mean_length": 654.1171875,
"completions/mean_terminated_length": 659.2677001953125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.14186666666666667,
"grad_norm": 0.002368397079408169,
"learning_rate": 4.1875e-07,
"loss": -0.0029,
"num_tokens": 22339888.0,
"reward": 0.728515625,
"reward_std": 0.1766941398382187,
"rewards/accuracy_reward": 0.4765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98046875,
"rewards/mean_confidence_reward": 0.0,
"step": 133
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2714.0,
"completions/max_terminated_length": 2714.0,
"completions/mean_length": 608.35546875,
"completions/mean_terminated_length": 610.7412109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.14293333333333333,
"grad_norm": 0.0026498744264245033,
"learning_rate": 4.1249999999999997e-07,
"loss": -0.0001,
"num_tokens": 22541091.0,
"reward": 0.765625,
"reward_std": 0.16226203739643097,
"rewards/accuracy_reward": 0.55078125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98046875,
"rewards/mean_confidence_reward": 0.0,
"step": 134
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1930.0,
"completions/max_terminated_length": 1930.0,
"completions/mean_length": 557.61328125,
"completions/mean_terminated_length": 557.61328125,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.144,
"grad_norm": 0.002309601753950119,
"learning_rate": 4.0625e-07,
"loss": 0.0022,
"num_tokens": 22726232.0,
"reward": 0.8359375,
"reward_std": 0.11928972601890564,
"rewards/accuracy_reward": 0.67578125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 135
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1606.0,
"completions/max_terminated_length": 1606.0,
"completions/mean_length": 530.93359375,
"completions/mean_terminated_length": 533.0157470703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.14506666666666668,
"grad_norm": 0.004032180644571781,
"learning_rate": 4e-07,
"loss": -0.0017,
"num_tokens": 22907151.0,
"reward": 0.79296875,
"reward_std": 0.151150643825531,
"rewards/accuracy_reward": 0.58984375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 136
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2467.0,
"completions/max_terminated_length": 2467.0,
"completions/mean_length": 562.1875,
"completions/mean_terminated_length": 564.3922119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.14613333333333334,
"grad_norm": 0.00222558225505054,
"learning_rate": 3.9375e-07,
"loss": 0.0038,
"num_tokens": 23094567.0,
"reward": 0.79296875,
"reward_std": 0.11783070862293243,
"rewards/accuracy_reward": 0.58984375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 137
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1617.0,
"completions/max_terminated_length": 1617.0,
"completions/mean_length": 519.13671875,
"completions/mean_terminated_length": 527.3770141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.1472,
"grad_norm": 0.0021844832226634026,
"learning_rate": 3.875e-07,
"loss": -0.0073,
"num_tokens": 23268314.0,
"reward": 0.82421875,
"reward_std": 0.12335902452468872,
"rewards/accuracy_reward": 0.6640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 138
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2640.0,
"completions/max_terminated_length": 2640.0,
"completions/mean_length": 500.19921875,
"completions/mean_terminated_length": 502.16082763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.14826666666666666,
"grad_norm": 0.002530711703002453,
"learning_rate": 3.8124999999999995e-07,
"loss": -0.0005,
"num_tokens": 23435973.0,
"reward": 0.80859375,
"reward_std": 0.11682993173599243,
"rewards/accuracy_reward": 0.62109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 139
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2687.0,
"completions/max_terminated_length": 2687.0,
"completions/mean_length": 519.8203125,
"completions/mean_terminated_length": 519.8203125,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.14933333333333335,
"grad_norm": 0.0030080152209848166,
"learning_rate": 3.75e-07,
"loss": 0.0,
"num_tokens": 23610575.0,
"reward": 0.8203125,
"reward_std": 0.12585808336734772,
"rewards/accuracy_reward": 0.640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 140
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2145.0,
"completions/max_terminated_length": 2145.0,
"completions/mean_length": 583.33203125,
"completions/mean_terminated_length": 583.33203125,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.1504,
"grad_norm": 0.0019059600308537483,
"learning_rate": 3.6875e-07,
"loss": -0.0007,
"num_tokens": 23803516.0,
"reward": 0.828125,
"reward_std": 0.1069924533367157,
"rewards/accuracy_reward": 0.66015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 141
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3018.0,
"completions/max_terminated_length": 3018.0,
"completions/mean_length": 614.05859375,
"completions/mean_terminated_length": 614.05859375,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.15146666666666667,
"grad_norm": 0.002020376967266202,
"learning_rate": 3.6249999999999997e-07,
"loss": 0.0034,
"num_tokens": 24002387.0,
"reward": 0.76953125,
"reward_std": 0.11772558093070984,
"rewards/accuracy_reward": 0.54296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 142
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2730.0,
"completions/max_terminated_length": 2730.0,
"completions/mean_length": 571.609375,
"completions/mean_terminated_length": 576.1102294921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.15253333333333333,
"grad_norm": 0.0026409905403852463,
"learning_rate": 3.5625e-07,
"loss": 0.0016,
"num_tokens": 24192567.0,
"reward": 0.7734375,
"reward_std": 0.1352597177028656,
"rewards/accuracy_reward": 0.55859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 143
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1415.0,
"completions/max_terminated_length": 1415.0,
"completions/mean_length": 521.58203125,
"completions/mean_terminated_length": 521.58203125,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.1536,
"grad_norm": 0.002217566827312112,
"learning_rate": 3.5e-07,
"loss": 0.0033,
"num_tokens": 24366732.0,
"reward": 0.845703125,
"reward_std": 0.12046800553798676,
"rewards/accuracy_reward": 0.6953125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 144
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1565.0,
"completions/max_terminated_length": 1565.0,
"completions/mean_length": 503.8359375,
"completions/mean_terminated_length": 505.8117980957031,
"completions/min_length": 0.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.15466666666666667,
"grad_norm": 0.002671412192285061,
"learning_rate": 3.4375e-07,
"loss": 0.005,
"num_tokens": 24534930.0,
"reward": 0.8203125,
"reward_std": 0.1498369574546814,
"rewards/accuracy_reward": 0.64453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 145
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2618.0,
"completions/max_terminated_length": 2618.0,
"completions/mean_length": 573.640625,
"completions/mean_terminated_length": 580.4426879882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.15573333333333333,
"grad_norm": 0.0029542739503085613,
"learning_rate": 3.375e-07,
"loss": -0.0058,
"num_tokens": 24725510.0,
"reward": 0.736328125,
"reward_std": 0.14751780033111572,
"rewards/accuracy_reward": 0.49609375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9765625,
"rewards/mean_confidence_reward": 0.0,
"step": 146
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1795.0,
"completions/max_terminated_length": 1795.0,
"completions/mean_length": 584.8515625,
"completions/mean_terminated_length": 587.1451416015625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.1568,
"grad_norm": 0.0021321328822523355,
"learning_rate": 3.3124999999999995e-07,
"loss": -0.0033,
"num_tokens": 24915424.0,
"reward": 0.75390625,
"reward_std": 0.11257164925336838,
"rewards/accuracy_reward": 0.51171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 147
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2517.0,
"completions/max_terminated_length": 2517.0,
"completions/mean_length": 514.10546875,
"completions/mean_terminated_length": 514.10546875,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.15786666666666666,
"grad_norm": 0.0019005911890417337,
"learning_rate": 3.25e-07,
"loss": 0.0015,
"num_tokens": 25088659.0,
"reward": 0.865234375,
"reward_std": 0.09262597560882568,
"rewards/accuracy_reward": 0.734375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 148
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1843.0,
"completions/max_terminated_length": 1843.0,
"completions/mean_length": 584.625,
"completions/mean_terminated_length": 586.9176635742188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.15893333333333334,
"grad_norm": 0.002471345942467451,
"learning_rate": 3.1874999999999997e-07,
"loss": -0.0016,
"num_tokens": 25279291.0,
"reward": 0.80078125,
"reward_std": 0.13565543293952942,
"rewards/accuracy_reward": 0.609375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 149
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2972.0,
"completions/max_terminated_length": 2972.0,
"completions/mean_length": 484.1328125,
"completions/mean_terminated_length": 484.1328125,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.16,
"grad_norm": 0.003143973182886839,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.0027,
"num_tokens": 25444701.0,
"reward": 0.798828125,
"reward_std": 0.11844512820243835,
"rewards/accuracy_reward": 0.6015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 150
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2295.0,
"completions/max_terminated_length": 2295.0,
"completions/mean_length": 572.359375,
"completions/mean_terminated_length": 581.4444580078125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.16106666666666666,
"grad_norm": 0.0032989559695124626,
"learning_rate": 3.0625000000000003e-07,
"loss": -0.0111,
"num_tokens": 25634761.0,
"reward": 0.7578125,
"reward_std": 0.15691381692886353,
"rewards/accuracy_reward": 0.53125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 151
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2817.0,
"completions/max_terminated_length": 2817.0,
"completions/mean_length": 578.6015625,
"completions/mean_terminated_length": 578.6015625,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.16213333333333332,
"grad_norm": 0.002348927780985832,
"learning_rate": 3e-07,
"loss": 0.0017,
"num_tokens": 25824787.0,
"reward": 0.794921875,
"reward_std": 0.10950467735528946,
"rewards/accuracy_reward": 0.59375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 152
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1806.0,
"completions/max_terminated_length": 1806.0,
"completions/mean_length": 564.3515625,
"completions/mean_terminated_length": 564.3515625,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.1632,
"grad_norm": 0.00277383578941226,
"learning_rate": 2.9375e-07,
"loss": 0.003,
"num_tokens": 26013093.0,
"reward": 0.818359375,
"reward_std": 0.1355985552072525,
"rewards/accuracy_reward": 0.640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 153
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1531.0,
"completions/max_terminated_length": 1531.0,
"completions/mean_length": 501.76953125,
"completions/mean_terminated_length": 507.7193908691406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.16426666666666667,
"grad_norm": 0.0025456154253333807,
"learning_rate": 2.8749999999999995e-07,
"loss": -0.0028,
"num_tokens": 26182498.0,
"reward": 0.75390625,
"reward_std": 0.11767856031656265,
"rewards/accuracy_reward": 0.52734375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98046875,
"rewards/mean_confidence_reward": 0.0,
"step": 154
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1133.0,
"completions/max_terminated_length": 1133.0,
"completions/mean_length": 484.03515625,
"completions/mean_terminated_length": 485.933349609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.16533333333333333,
"grad_norm": 0.0033706254325807095,
"learning_rate": 2.8125e-07,
"loss": -0.001,
"num_tokens": 26350139.0,
"reward": 0.765625,
"reward_std": 0.17656250298023224,
"rewards/accuracy_reward": 0.54296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 155
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2030.0,
"completions/max_terminated_length": 2030.0,
"completions/mean_length": 532.453125,
"completions/mean_terminated_length": 532.453125,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.1664,
"grad_norm": 0.0025104114320129156,
"learning_rate": 2.75e-07,
"loss": 0.0007,
"num_tokens": 26527719.0,
"reward": 0.78125,
"reward_std": 0.1339460164308548,
"rewards/accuracy_reward": 0.5625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 156
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1410.0,
"completions/max_terminated_length": 1410.0,
"completions/mean_length": 511.8046875,
"completions/mean_terminated_length": 513.811767578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.16746666666666668,
"grad_norm": 0.002587872790172696,
"learning_rate": 2.6874999999999997e-07,
"loss": -0.002,
"num_tokens": 26698981.0,
"reward": 0.845703125,
"reward_std": 0.09791658818721771,
"rewards/accuracy_reward": 0.6953125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 157
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1684.0,
"completions/max_terminated_length": 1684.0,
"completions/mean_length": 527.10546875,
"completions/mean_terminated_length": 533.3557739257812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.16853333333333334,
"grad_norm": 0.0027954294346272945,
"learning_rate": 2.625e-07,
"loss": -0.0081,
"num_tokens": 26875672.0,
"reward": 0.806640625,
"reward_std": 0.15520510077476501,
"rewards/accuracy_reward": 0.625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 158
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1546.0,
"completions/max_terminated_length": 1546.0,
"completions/mean_length": 494.796875,
"completions/mean_terminated_length": 498.6929016113281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.1696,
"grad_norm": 0.003982344176620245,
"learning_rate": 2.5625e-07,
"loss": 0.0001,
"num_tokens": 27043636.0,
"reward": 0.798828125,
"reward_std": 0.13756868243217468,
"rewards/accuracy_reward": 0.61328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 159
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2865.0,
"completions/max_terminated_length": 2865.0,
"completions/mean_length": 550.4765625,
"completions/mean_terminated_length": 550.4765625,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.17066666666666666,
"grad_norm": 0.0038785592187196016,
"learning_rate": 2.5e-07,
"loss": 0.002,
"num_tokens": 27225910.0,
"reward": 0.798828125,
"reward_std": 0.1503000259399414,
"rewards/accuracy_reward": 0.60546875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 160
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1685.0,
"completions/max_terminated_length": 1685.0,
"completions/mean_length": 507.02734375,
"completions/mean_terminated_length": 507.02734375,
"completions/min_length": 96.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.17173333333333332,
"grad_norm": 0.0029769681859761477,
"learning_rate": 2.4375e-07,
"loss": 0.0046,
"num_tokens": 27396141.0,
"reward": 0.841796875,
"reward_std": 0.11224833130836487,
"rewards/accuracy_reward": 0.68359375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 161
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2254.0,
"completions/max_terminated_length": 2254.0,
"completions/mean_length": 473.375,
"completions/mean_terminated_length": 475.2314147949219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.1728,
"grad_norm": 0.0020322096534073353,
"learning_rate": 2.3749999999999998e-07,
"loss": -0.0021,
"num_tokens": 27557981.0,
"reward": 0.873046875,
"reward_std": 0.07698607444763184,
"rewards/accuracy_reward": 0.75,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 162
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2614.0,
"completions/max_terminated_length": 2614.0,
"completions/mean_length": 546.265625,
"completions/mean_terminated_length": 550.5669555664062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.17386666666666667,
"grad_norm": 0.0023446192499250174,
"learning_rate": 2.3125e-07,
"loss": -0.0021,
"num_tokens": 27739169.0,
"reward": 0.79296875,
"reward_std": 0.12823016941547394,
"rewards/accuracy_reward": 0.59375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 163
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2537.0,
"completions/max_terminated_length": 2537.0,
"completions/mean_length": 601.3359375,
"completions/mean_terminated_length": 606.0708618164062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.17493333333333333,
"grad_norm": 0.0024411152116954327,
"learning_rate": 2.25e-07,
"loss": -0.0019,
"num_tokens": 27935759.0,
"reward": 0.7734375,
"reward_std": 0.13402026891708374,
"rewards/accuracy_reward": 0.55859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 164
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1479.0,
"completions/max_terminated_length": 1479.0,
"completions/mean_length": 562.64453125,
"completions/mean_terminated_length": 562.64453125,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.176,
"grad_norm": 0.0035210850182920694,
"learning_rate": 2.1875e-07,
"loss": -0.0001,
"num_tokens": 28121884.0,
"reward": 0.767578125,
"reward_std": 0.12033380568027496,
"rewards/accuracy_reward": 0.53515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 165
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2370.0,
"completions/max_terminated_length": 2370.0,
"completions/mean_length": 567.9296875,
"completions/mean_terminated_length": 570.1569213867188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.17706666666666668,
"grad_norm": 0.00238078273832798,
"learning_rate": 2.1249999999999998e-07,
"loss": -0.0027,
"num_tokens": 28309970.0,
"reward": 0.85546875,
"reward_std": 0.10644911229610443,
"rewards/accuracy_reward": 0.71484375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 166
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1542.0,
"completions/max_terminated_length": 1542.0,
"completions/mean_length": 532.76171875,
"completions/mean_terminated_length": 534.8510131835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.17813333333333334,
"grad_norm": 0.002241513691842556,
"learning_rate": 2.0624999999999998e-07,
"loss": -0.0021,
"num_tokens": 28488477.0,
"reward": 0.857421875,
"reward_std": 0.12674342095851898,
"rewards/accuracy_reward": 0.72265625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 167
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1715.0,
"completions/max_terminated_length": 1715.0,
"completions/mean_length": 560.01953125,
"completions/mean_terminated_length": 562.2156982421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.1792,
"grad_norm": 0.00295989029109478,
"learning_rate": 2e-07,
"loss": -0.0001,
"num_tokens": 28673026.0,
"reward": 0.84375,
"reward_std": 0.15729182958602905,
"rewards/accuracy_reward": 0.69140625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 168
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2230.0,
"completions/max_terminated_length": 2230.0,
"completions/mean_length": 544.15234375,
"completions/mean_terminated_length": 544.15234375,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.18026666666666666,
"grad_norm": 0.0020053465850651264,
"learning_rate": 1.9375e-07,
"loss": -0.0003,
"num_tokens": 28853025.0,
"reward": 0.76171875,
"reward_std": 0.11323626339435577,
"rewards/accuracy_reward": 0.52734375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 169
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1223.0,
"completions/max_terminated_length": 1223.0,
"completions/mean_length": 551.76953125,
"completions/mean_terminated_length": 551.76953125,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.18133333333333335,
"grad_norm": 0.0023105042055249214,
"learning_rate": 1.875e-07,
"loss": 0.0008,
"num_tokens": 29034942.0,
"reward": 0.833984375,
"reward_std": 0.08292023092508316,
"rewards/accuracy_reward": 0.66796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 170
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1238.0,
"completions/max_terminated_length": 1238.0,
"completions/mean_length": 517.66015625,
"completions/mean_terminated_length": 519.6902465820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.1824,
"grad_norm": 0.0039973389357328415,
"learning_rate": 1.8124999999999999e-07,
"loss": -0.0003,
"num_tokens": 29210871.0,
"reward": 0.76953125,
"reward_std": 0.1397336721420288,
"rewards/accuracy_reward": 0.546875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 171
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2051.0,
"completions/max_terminated_length": 2051.0,
"completions/mean_length": 497.28515625,
"completions/mean_terminated_length": 505.1785888671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.18346666666666667,
"grad_norm": 0.0022053951397538185,
"learning_rate": 1.75e-07,
"loss": -0.0104,
"num_tokens": 29378040.0,
"reward": 0.828125,
"reward_std": 0.10887734591960907,
"rewards/accuracy_reward": 0.671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 172
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1904.0,
"completions/max_terminated_length": 1904.0,
"completions/mean_length": 571.92578125,
"completions/mean_terminated_length": 576.4291381835938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.18453333333333333,
"grad_norm": 0.002169808140024543,
"learning_rate": 1.6875e-07,
"loss": -0.0046,
"num_tokens": 29564125.0,
"reward": 0.875,
"reward_std": 0.10419078171253204,
"rewards/accuracy_reward": 0.7578125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 173
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2770.0,
"completions/max_terminated_length": 2770.0,
"completions/mean_length": 601.39453125,
"completions/mean_terminated_length": 608.5256958007812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.1856,
"grad_norm": 0.002962428145110607,
"learning_rate": 1.625e-07,
"loss": -0.0047,
"num_tokens": 29758826.0,
"reward": 0.759765625,
"reward_std": 0.17846155166625977,
"rewards/accuracy_reward": 0.53515625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 174
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 3038.0,
"completions/max_terminated_length": 3038.0,
"completions/mean_length": 544.2578125,
"completions/mean_terminated_length": 557.3200073242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.18666666666666668,
"grad_norm": 0.002639415208250284,
"learning_rate": 1.5624999999999999e-07,
"loss": -0.0039,
"num_tokens": 29940492.0,
"reward": 0.685546875,
"reward_std": 0.12559287250041962,
"rewards/accuracy_reward": 0.40234375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.96875,
"rewards/mean_confidence_reward": 0.0,
"step": 175
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2489.0,
"completions/max_terminated_length": 2489.0,
"completions/mean_length": 552.74609375,
"completions/mean_terminated_length": 554.9137573242188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.18773333333333334,
"grad_norm": 0.003247645916417241,
"learning_rate": 1.5e-07,
"loss": 0.0037,
"num_tokens": 30122571.0,
"reward": 0.794921875,
"reward_std": 0.12789133191108704,
"rewards/accuracy_reward": 0.6015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 176
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 3020.0,
"completions/max_terminated_length": 3020.0,
"completions/mean_length": 527.73046875,
"completions/mean_terminated_length": 529.800048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.1888,
"grad_norm": 0.002995978808030486,
"learning_rate": 1.4374999999999997e-07,
"loss": -0.0011,
"num_tokens": 30298014.0,
"reward": 0.79296875,
"reward_std": 0.11722171306610107,
"rewards/accuracy_reward": 0.59375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 177
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1377.0,
"completions/max_terminated_length": 1377.0,
"completions/mean_length": 530.21484375,
"completions/mean_terminated_length": 536.5020141601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.18986666666666666,
"grad_norm": 0.0029503873083740473,
"learning_rate": 1.375e-07,
"loss": -0.0019,
"num_tokens": 30476333.0,
"reward": 0.818359375,
"reward_std": 0.1244158148765564,
"rewards/accuracy_reward": 0.6484375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 178
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2929.0,
"completions/max_terminated_length": 2929.0,
"completions/mean_length": 548.2890625,
"completions/mean_terminated_length": 554.79052734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.19093333333333334,
"grad_norm": 0.0034041572362184525,
"learning_rate": 1.3125e-07,
"loss": -0.0076,
"num_tokens": 30659471.0,
"reward": 0.8203125,
"reward_std": 0.12339387834072113,
"rewards/accuracy_reward": 0.66015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98046875,
"rewards/mean_confidence_reward": 0.0,
"step": 179
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2633.0,
"completions/max_terminated_length": 2633.0,
"completions/mean_length": 642.43359375,
"completions/mean_terminated_length": 644.9530029296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.192,
"grad_norm": 0.003008870640769601,
"learning_rate": 1.25e-07,
"loss": 0.0016,
"num_tokens": 30864302.0,
"reward": 0.802734375,
"reward_std": 0.0949535220861435,
"rewards/accuracy_reward": 0.61328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 180
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 1660.0,
"completions/max_terminated_length": 1660.0,
"completions/mean_length": 495.234375,
"completions/mean_terminated_length": 501.10675048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.19306666666666666,
"grad_norm": 0.004007770214229822,
"learning_rate": 1.1874999999999999e-07,
"loss": -0.0062,
"num_tokens": 31033858.0,
"reward": 0.7890625,
"reward_std": 0.163752019405365,
"rewards/accuracy_reward": 0.59375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 181
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1647.0,
"completions/max_terminated_length": 1647.0,
"completions/mean_length": 541.2734375,
"completions/mean_terminated_length": 543.3961181640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.19413333333333332,
"grad_norm": 0.002634539268910885,
"learning_rate": 1.125e-07,
"loss": -0.0029,
"num_tokens": 31215096.0,
"reward": 0.8203125,
"reward_std": 0.10317442566156387,
"rewards/accuracy_reward": 0.64453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 182
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2138.0,
"completions/max_terminated_length": 2138.0,
"completions/mean_length": 563.7265625,
"completions/mean_terminated_length": 568.1653442382812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.1952,
"grad_norm": 0.002939382568001747,
"learning_rate": 1.0624999999999999e-07,
"loss": -0.0052,
"num_tokens": 31402602.0,
"reward": 0.828125,
"reward_std": 0.152683824300766,
"rewards/accuracy_reward": 0.6640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 183
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2148.0,
"completions/max_terminated_length": 2148.0,
"completions/mean_length": 555.3515625,
"completions/mean_terminated_length": 555.3515625,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"epoch": 0.19626666666666667,
"grad_norm": 0.0018974288832396269,
"learning_rate": 1e-07,
"loss": 0.0021,
"num_tokens": 31586564.0,
"reward": 0.841796875,
"reward_std": 0.11152756214141846,
"rewards/accuracy_reward": 0.68359375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 184
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3057.0,
"completions/max_terminated_length": 3057.0,
"completions/mean_length": 559.5703125,
"completions/mean_terminated_length": 563.9763793945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.19733333333333333,
"grad_norm": 0.002806787844747305,
"learning_rate": 9.375e-08,
"loss": 0.002,
"num_tokens": 31773246.0,
"reward": 0.822265625,
"reward_std": 0.150010883808136,
"rewards/accuracy_reward": 0.66015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 185
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2895.0,
"completions/max_terminated_length": 2895.0,
"completions/mean_length": 561.984375,
"completions/mean_terminated_length": 568.6482543945312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.1984,
"grad_norm": 0.002493697451427579,
"learning_rate": 8.75e-08,
"loss": -0.0049,
"num_tokens": 31958666.0,
"reward": 0.826171875,
"reward_std": 0.12300161272287369,
"rewards/accuracy_reward": 0.671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98046875,
"rewards/mean_confidence_reward": 0.0,
"step": 186
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2313.0,
"completions/max_terminated_length": 2313.0,
"completions/mean_length": 572.26953125,
"completions/mean_terminated_length": 581.3532104492188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.19946666666666665,
"grad_norm": 0.0027995705604553223,
"learning_rate": 8.125e-08,
"loss": -0.0067,
"num_tokens": 32143223.0,
"reward": 0.787109375,
"reward_std": 0.17532166838645935,
"rewards/accuracy_reward": 0.58984375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 187
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2866.0,
"completions/max_terminated_length": 2866.0,
"completions/mean_length": 618.55859375,
"completions/mean_terminated_length": 618.55859375,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.20053333333333334,
"grad_norm": 0.0020447983406484127,
"learning_rate": 7.5e-08,
"loss": 0.0019,
"num_tokens": 32342158.0,
"reward": 0.849609375,
"reward_std": 0.09593826532363892,
"rewards/accuracy_reward": 0.69921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 188
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2231.0,
"completions/max_terminated_length": 2231.0,
"completions/mean_length": 553.40234375,
"completions/mean_terminated_length": 553.40234375,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.2016,
"grad_norm": 0.0024205332156270742,
"learning_rate": 6.875e-08,
"loss": -0.0012,
"num_tokens": 32528109.0,
"reward": 0.798828125,
"reward_std": 0.12132295966148376,
"rewards/accuracy_reward": 0.59765625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 189
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1671.0,
"completions/max_terminated_length": 1671.0,
"completions/mean_length": 599.33984375,
"completions/mean_terminated_length": 601.6902465820312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.20266666666666666,
"grad_norm": 0.002275430364534259,
"learning_rate": 6.25e-08,
"loss": -0.002,
"num_tokens": 32723660.0,
"reward": 0.814453125,
"reward_std": 0.13816100358963013,
"rewards/accuracy_reward": 0.6328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 190
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2994.0,
"completions/max_terminated_length": 2994.0,
"completions/mean_length": 575.39453125,
"completions/mean_terminated_length": 577.6510009765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.20373333333333332,
"grad_norm": 0.002242535352706909,
"learning_rate": 5.625e-08,
"loss": -0.0012,
"num_tokens": 32911641.0,
"reward": 0.7890625,
"reward_std": 0.10681800544261932,
"rewards/accuracy_reward": 0.59375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 191
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2673.0,
"completions/max_terminated_length": 2673.0,
"completions/mean_length": 556.78125,
"completions/mean_terminated_length": 558.9647216796875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.2048,
"grad_norm": 0.002518382389098406,
"learning_rate": 5e-08,
"loss": -0.0033,
"num_tokens": 33095665.0,
"reward": 0.810546875,
"reward_std": 0.1467379629611969,
"rewards/accuracy_reward": 0.62890625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 192
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2173.0,
"completions/max_terminated_length": 2173.0,
"completions/mean_length": 549.875,
"completions/mean_terminated_length": 554.2047119140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.20586666666666667,
"grad_norm": 0.0030763852410018444,
"learning_rate": 4.375e-08,
"loss": 0.001,
"num_tokens": 33278657.0,
"reward": 0.779296875,
"reward_std": 0.18709099292755127,
"rewards/accuracy_reward": 0.56640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 193
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 1513.0,
"completions/max_terminated_length": 1513.0,
"completions/mean_length": 491.72265625,
"completions/mean_terminated_length": 495.594482421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.20693333333333333,
"grad_norm": 0.0022165800910443068,
"learning_rate": 3.75e-08,
"loss": -0.0025,
"num_tokens": 33446994.0,
"reward": 0.794921875,
"reward_std": 0.10330984741449356,
"rewards/accuracy_reward": 0.6015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 194
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2444.0,
"completions/max_terminated_length": 2444.0,
"completions/mean_length": 579.046875,
"completions/mean_terminated_length": 581.3176879882812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.208,
"grad_norm": 0.0022793509997427464,
"learning_rate": 3.125e-08,
"loss": 0.0012,
"num_tokens": 33637726.0,
"reward": 0.8125,
"reward_std": 0.13170567154884338,
"rewards/accuracy_reward": 0.62890625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 195
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1295.0,
"completions/max_terminated_length": 1295.0,
"completions/mean_length": 437.84765625,
"completions/mean_terminated_length": 437.84765625,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.20906666666666668,
"grad_norm": 0.002784677315503359,
"learning_rate": 2.5e-08,
"loss": 0.0039,
"num_tokens": 33788871.0,
"reward": 0.82421875,
"reward_std": 0.08995966613292694,
"rewards/accuracy_reward": 0.6484375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.0,
"step": 196
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2686.0,
"completions/max_terminated_length": 2686.0,
"completions/mean_length": 554.48828125,
"completions/mean_terminated_length": 561.0632934570312,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.21013333333333334,
"grad_norm": 0.0034406990744173527,
"learning_rate": 1.875e-08,
"loss": -0.0011,
"num_tokens": 33972388.0,
"reward": 0.80859375,
"reward_std": 0.14626148343086243,
"rewards/accuracy_reward": 0.6328125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 197
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2712.0,
"completions/max_terminated_length": 2712.0,
"completions/mean_length": 507.5078125,
"completions/mean_terminated_length": 509.4980773925781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.2112,
"grad_norm": 0.002338421531021595,
"learning_rate": 1.25e-08,
"loss": 0.0021,
"num_tokens": 34144206.0,
"reward": 0.806640625,
"reward_std": 0.12046800553798676,
"rewards/accuracy_reward": 0.62109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 198
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2017.0,
"completions/max_terminated_length": 2017.0,
"completions/mean_length": 544.703125,
"completions/mean_terminated_length": 548.9921264648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.21226666666666666,
"grad_norm": 0.0036452198401093483,
"learning_rate": 6.25e-09,
"loss": -0.0031,
"num_tokens": 34324362.0,
"reward": 0.82421875,
"reward_std": 0.12910333275794983,
"rewards/accuracy_reward": 0.65625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 199
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 1704.0,
"completions/max_terminated_length": 1704.0,
"completions/mean_length": 540.6875,
"completions/mean_terminated_length": 542.807861328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.21333333333333335,
"grad_norm": 0.0033806608989834785,
"learning_rate": 0.0,
"loss": 0.0028,
"num_tokens": 34507338.0,
"reward": 0.802734375,
"reward_std": 0.12625747919082642,
"rewards/accuracy_reward": 0.6171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 200
},
{
"epoch": 0.21333333333333335,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.0003638234540994745,
"train_runtime": 15513.9302,
"train_samples_per_second": 3.3,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 34507338,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}