Files
GRPO-7B-long-step-hotpot/trainer_state.json
ModelHub XC 69bc612eeb 初始化项目,由ModelHub XC社区提供模型
Model: zhaohq/GRPO-7B-long-step-hotpot
Source: Original Platform
2026-06-04 18:30:19 +08:00

6885 lines
260 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.32,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1640625,
"completions/max_length": 1000.0,
"completions/max_terminated_length": 1000.0,
"completions/mean_length": 341.4609375,
"completions/mean_terminated_length": 408.47662353515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0016,
"grad_norm": 0.17645138502120972,
"learning_rate": 5e-08,
"loss": -0.116,
"num_tokens": 486582.0,
"reward": 0.41310209035873413,
"reward_std": 0.4805126190185547,
"rewards/accuracy_reward_long_step": 0.2265625,
"rewards/final_brier_reward_long_step": 0.11814829707145691,
"rewards/format_reward_long_step": 0.23046875,
"rewards/stepwise_brier_reward_long_step": 0.1670725792646408,
"step": 1
},
{
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.19140625,
"completions/max_length": 1019.0,
"completions/max_terminated_length": 1019.0,
"completions/mean_length": 303.75,
"completions/mean_terminated_length": 375.65216064453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0032,
"grad_norm": 0.6896445155143738,
"learning_rate": 1e-07,
"loss": -0.1486,
"num_tokens": 985630.0,
"reward": 0.4098304212093353,
"reward_std": 0.5015645623207092,
"rewards/accuracy_reward_long_step": 0.1875,
"rewards/final_brier_reward_long_step": 0.1355031430721283,
"rewards/format_reward_long_step": 0.27734375,
"rewards/stepwise_brier_reward_long_step": 0.1991310715675354,
"step": 2
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 1001.0,
"completions/max_terminated_length": 1001.0,
"completions/mean_length": 327.15234375,
"completions/mean_terminated_length": 402.6490478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0048,
"grad_norm": 0.40888160467147827,
"learning_rate": 1.5e-07,
"loss": -0.1498,
"num_tokens": 1484085.0,
"reward": 0.45745182037353516,
"reward_std": 0.5877432227134705,
"rewards/accuracy_reward_long_step": 0.21484375,
"rewards/final_brier_reward_long_step": 0.15137070417404175,
"rewards/format_reward_long_step": 0.2890625,
"rewards/stepwise_brier_reward_long_step": 0.24093663692474365,
"step": 3
},
{
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.75,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.15000000000000002,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20703125,
"completions/max_length": 1003.0,
"completions/max_terminated_length": 1003.0,
"completions/mean_length": 292.12890625,
"completions/mean_terminated_length": 368.3990173339844,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0064,
"grad_norm": 5.972024917602539,
"learning_rate": 2e-07,
"loss": -0.1685,
"num_tokens": 1985870.0,
"reward": 0.3280823230743408,
"reward_std": 0.4515482783317566,
"rewards/accuracy_reward_long_step": 0.15625,
"rewards/final_brier_reward_long_step": 0.10437265783548355,
"rewards/format_reward_long_step": 0.21484375,
"rewards/stepwise_brier_reward_long_step": 0.15326906740665436,
"step": 4
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.75,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15234375,
"completions/max_length": 1017.0,
"completions/max_terminated_length": 1017.0,
"completions/mean_length": 338.7734375,
"completions/mean_terminated_length": 399.65899658203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.008,
"grad_norm": 0.17540286481380463,
"learning_rate": 2.5e-07,
"loss": -0.0985,
"num_tokens": 2508156.0,
"reward": 0.3251497149467468,
"reward_std": 0.4245316982269287,
"rewards/accuracy_reward_long_step": 0.14453125,
"rewards/final_brier_reward_long_step": 0.12149253487586975,
"rewards/format_reward_long_step": 0.22265625,
"rewards/stepwise_brier_reward_long_step": 0.15566879510879517,
"step": 5
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13671875,
"completions/max_length": 994.0,
"completions/max_terminated_length": 994.0,
"completions/mean_length": 347.57421875,
"completions/mean_terminated_length": 402.61993408203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0096,
"grad_norm": 0.40559351444244385,
"learning_rate": 3e-07,
"loss": -0.1487,
"num_tokens": 3024799.0,
"reward": 0.39223921298980713,
"reward_std": 0.5162125825881958,
"rewards/accuracy_reward_long_step": 0.1640625,
"rewards/final_brier_reward_long_step": 0.13095274567604065,
"rewards/format_reward_long_step": 0.28515625,
"rewards/stepwise_brier_reward_long_step": 0.21144148707389832,
"step": 6
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 973.0,
"completions/max_terminated_length": 973.0,
"completions/mean_length": 341.73828125,
"completions/mean_terminated_length": 390.55804443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0112,
"grad_norm": 0.18455225229263306,
"learning_rate": 3.5e-07,
"loss": -0.0943,
"num_tokens": 3531764.0,
"reward": 0.3882223963737488,
"reward_std": 0.5209769606590271,
"rewards/accuracy_reward_long_step": 0.171875,
"rewards/final_brier_reward_long_step": 0.1431557536125183,
"rewards/format_reward_long_step": 0.265625,
"rewards/stepwise_brier_reward_long_step": 0.1909838318824768,
"step": 7
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.5,
"calib/mean_conf": 0.925,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.024999999999999967,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13671875,
"completions/max_length": 997.0,
"completions/max_terminated_length": 997.0,
"completions/mean_length": 329.87890625,
"completions/mean_terminated_length": 382.1221923828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0128,
"grad_norm": 0.214069664478302,
"learning_rate": 4e-07,
"loss": -0.114,
"num_tokens": 4020813.0,
"reward": 0.4169233441352844,
"reward_std": 0.530878484249115,
"rewards/accuracy_reward_long_step": 0.203125,
"rewards/final_brier_reward_long_step": 0.12548723816871643,
"rewards/format_reward_long_step": 0.265625,
"rewards/stepwise_brier_reward_long_step": 0.19845610857009888,
"step": 8
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.2,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.18359375,
"completions/max_length": 1013.0,
"completions/max_terminated_length": 1013.0,
"completions/mean_length": 327.0546875,
"completions/mean_terminated_length": 400.60284423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0144,
"grad_norm": 0.5599529147148132,
"learning_rate": 4.5e-07,
"loss": -0.1094,
"num_tokens": 4537731.0,
"reward": 0.26959529519081116,
"reward_std": 0.40018031001091003,
"rewards/accuracy_reward_long_step": 0.07421875,
"rewards/final_brier_reward_long_step": 0.102345310151577,
"rewards/format_reward_long_step": 0.25,
"rewards/stepwise_brier_reward_long_step": 0.17916086316108704,
"step": 9
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.9,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12890625,
"completions/max_length": 1003.0,
"completions/max_terminated_length": 1003.0,
"completions/mean_length": 344.32421875,
"completions/mean_terminated_length": 395.2780456542969,
"completions/min_length": 0.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.016,
"grad_norm": 0.06352151185274124,
"learning_rate": 5e-07,
"loss": -0.081,
"num_tokens": 5045750.0,
"reward": 0.34384429454803467,
"reward_std": 0.42729049921035767,
"rewards/accuracy_reward_long_step": 0.14453125,
"rewards/final_brier_reward_long_step": 0.11546708643436432,
"rewards/format_reward_long_step": 0.2578125,
"rewards/stepwise_brier_reward_long_step": 0.166159987449646,
"step": 10
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.0,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1953125,
"completions/max_length": 1004.0,
"completions/max_terminated_length": 1004.0,
"completions/mean_length": 309.78125,
"completions/mean_terminated_length": 384.97088623046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.0176,
"grad_norm": 0.07709571719169617,
"learning_rate": 5.5e-07,
"loss": -0.1551,
"num_tokens": 5556670.0,
"reward": 0.314059853553772,
"reward_std": 0.4106077551841736,
"rewards/accuracy_reward_long_step": 0.125,
"rewards/final_brier_reward_long_step": 0.0948641449213028,
"rewards/format_reward_long_step": 0.24609375,
"rewards/stepwise_brier_reward_long_step": 0.1691877841949463,
"step": 11
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1953125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1024.0,
"completions/mean_length": 336.19921875,
"completions/mean_terminated_length": 417.80096435546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0192,
"grad_norm": 0.7987160086631775,
"learning_rate": 6e-07,
"loss": -0.1555,
"num_tokens": 6065281.0,
"reward": 0.388057678937912,
"reward_std": 0.4589148163795471,
"rewards/accuracy_reward_long_step": 0.2265625,
"rewards/final_brier_reward_long_step": 0.08411991596221924,
"rewards/format_reward_long_step": 0.203125,
"rewards/stepwise_brier_reward_long_step": 0.15561071038246155,
"step": 12
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.19140625,
"completions/max_length": 979.0,
"completions/max_terminated_length": 979.0,
"completions/mean_length": 305.63671875,
"completions/mean_terminated_length": 377.9855041503906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0208,
"grad_norm": 0.14753887057304382,
"learning_rate": 6.5e-07,
"loss": -0.1082,
"num_tokens": 6576452.0,
"reward": 0.4127691388130188,
"reward_std": 0.4886016845703125,
"rewards/accuracy_reward_long_step": 0.18359375,
"rewards/final_brier_reward_long_step": 0.1466279774904251,
"rewards/format_reward_long_step": 0.28515625,
"rewards/stepwise_brier_reward_long_step": 0.1997610330581665,
"step": 13
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0078125,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.5650000000000001,
"calib/nonempty_final_conf_rate": 0.0078125,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.23500000000000001,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16796875,
"completions/max_length": 1021.0,
"completions/max_terminated_length": 1021.0,
"completions/mean_length": 324.9140625,
"completions/mean_terminated_length": 390.5070495605469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.0224,
"grad_norm": 0.14357596635818481,
"learning_rate": 7e-07,
"loss": -0.1369,
"num_tokens": 7091094.0,
"reward": 0.5405763387680054,
"reward_std": 0.5295156836509705,
"rewards/accuracy_reward_long_step": 0.26171875,
"rewards/final_brier_reward_long_step": 0.17839357256889343,
"rewards/format_reward_long_step": 0.34765625,
"rewards/stepwise_brier_reward_long_step": 0.24172407388687134,
"step": 14
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.12109375,
"completions/max_length": 1002.0,
"completions/max_terminated_length": 1002.0,
"completions/mean_length": 351.58984375,
"completions/mean_terminated_length": 400.0311279296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.024,
"grad_norm": 0.15006816387176514,
"learning_rate": 7.5e-07,
"loss": -0.0696,
"num_tokens": 7608909.0,
"reward": 0.4260837733745575,
"reward_std": 0.49461331963539124,
"rewards/accuracy_reward_long_step": 0.1328125,
"rewards/final_brier_reward_long_step": 0.1504545956850052,
"rewards/format_reward_long_step": 0.3828125,
"rewards/stepwise_brier_reward_long_step": 0.2570054233074188,
"step": 15
},
{
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 966.0,
"completions/max_terminated_length": 966.0,
"completions/mean_length": 354.89453125,
"completions/mean_terminated_length": 412.9681701660156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.0256,
"grad_norm": 0.11908330768346786,
"learning_rate": 8e-07,
"loss": -0.0647,
"num_tokens": 8142738.0,
"reward": 0.45549800992012024,
"reward_std": 0.5145381093025208,
"rewards/accuracy_reward_long_step": 0.1953125,
"rewards/final_brier_reward_long_step": 0.14264921844005585,
"rewards/format_reward_long_step": 0.33203125,
"rewards/stepwise_brier_reward_long_step": 0.23403030633926392,
"step": 16
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.5,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 1012.0,
"completions/max_terminated_length": 1012.0,
"completions/mean_length": 327.94140625,
"completions/mean_terminated_length": 381.6045227050781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0272,
"grad_norm": 0.10344522446393967,
"learning_rate": 8.499999999999999e-07,
"loss": -0.1339,
"num_tokens": 8624011.0,
"reward": 0.4539112448692322,
"reward_std": 0.535567045211792,
"rewards/accuracy_reward_long_step": 0.15625,
"rewards/final_brier_reward_long_step": 0.16395819187164307,
"rewards/format_reward_long_step": 0.37890625,
"rewards/stepwise_brier_reward_long_step": 0.26887428760528564,
"step": 17
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 1008.0,
"completions/max_terminated_length": 1008.0,
"completions/mean_length": 356.265625,
"completions/mean_terminated_length": 400.0175476074219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.0288,
"grad_norm": 0.06071805581450462,
"learning_rate": 9e-07,
"loss": -0.0799,
"num_tokens": 9139559.0,
"reward": 0.47327494621276855,
"reward_std": 0.4894096851348877,
"rewards/accuracy_reward_long_step": 0.15625,
"rewards/final_brier_reward_long_step": 0.1746734380722046,
"rewards/format_reward_long_step": 0.3984375,
"rewards/stepwise_brier_reward_long_step": 0.2965514063835144,
"step": 18
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.5,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07421875,
"completions/max_length": 962.0,
"completions/max_terminated_length": 962.0,
"completions/mean_length": 345.92578125,
"completions/mean_terminated_length": 373.658203125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0304,
"grad_norm": 0.04385017603635788,
"learning_rate": 9.499999999999999e-07,
"loss": -0.0804,
"num_tokens": 9656020.0,
"reward": 0.5117301940917969,
"reward_std": 0.5140496492385864,
"rewards/accuracy_reward_long_step": 0.1875,
"rewards/final_brier_reward_long_step": 0.1626010686159134,
"rewards/format_reward_long_step": 0.4140625,
"rewards/stepwise_brier_reward_long_step": 0.3061947822570801,
"step": 19
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.35,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 1015.0,
"completions/max_terminated_length": 1015.0,
"completions/mean_length": 345.9765625,
"completions/mean_terminated_length": 388.46490478515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 91.0,
"epoch": 0.032,
"grad_norm": 0.12682317197322845,
"learning_rate": 1e-06,
"loss": -0.1204,
"num_tokens": 10172294.0,
"reward": 0.5370358228683472,
"reward_std": 0.5269090533256531,
"rewards/accuracy_reward_long_step": 0.1953125,
"rewards/final_brier_reward_long_step": 0.17095977067947388,
"rewards/format_reward_long_step": 0.44921875,
"rewards/stepwise_brier_reward_long_step": 0.29749590158462524,
"step": 20
},
{
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1171875,
"completions/max_length": 1007.0,
"completions/max_terminated_length": 1007.0,
"completions/mean_length": 326.0625,
"completions/mean_terminated_length": 369.3451232910156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.0336,
"grad_norm": 0.1039290726184845,
"learning_rate": 9.944444444444444e-07,
"loss": -0.0887,
"num_tokens": 10682942.0,
"reward": 0.6263545155525208,
"reward_std": 0.5448290109634399,
"rewards/accuracy_reward_long_step": 0.1953125,
"rewards/final_brier_reward_long_step": 0.23252148926258087,
"rewards/format_reward_long_step": 0.55078125,
"rewards/stepwise_brier_reward_long_step": 0.39008423686027527,
"step": 21
},
{
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 967.0,
"completions/max_terminated_length": 967.0,
"completions/mean_length": 318.80859375,
"completions/mean_terminated_length": 354.8478088378906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.0352,
"grad_norm": 0.07774824649095535,
"learning_rate": 9.88888888888889e-07,
"loss": -0.0844,
"num_tokens": 11188405.0,
"reward": 0.5633851885795593,
"reward_std": 0.480153888463974,
"rewards/accuracy_reward_long_step": 0.16015625,
"rewards/final_brier_reward_long_step": 0.20337249338626862,
"rewards/format_reward_long_step": 0.515625,
"rewards/stepwise_brier_reward_long_step": 0.37829315662384033,
"step": 22
},
{
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 1003.0,
"completions/max_terminated_length": 1003.0,
"completions/mean_length": 321.98828125,
"completions/mean_terminated_length": 355.29742431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.0368,
"grad_norm": 0.06589806079864502,
"learning_rate": 9.833333333333332e-07,
"loss": -0.0568,
"num_tokens": 11700754.0,
"reward": 0.7173271179199219,
"reward_std": 0.5148348808288574,
"rewards/accuracy_reward_long_step": 0.25,
"rewards/final_brier_reward_long_step": 0.2776620388031006,
"rewards/format_reward_long_step": 0.60546875,
"rewards/stepwise_brier_reward_long_step": 0.3807087540626526,
"step": 23
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 910.0,
"completions/max_terminated_length": 910.0,
"completions/mean_length": 338.0546875,
"completions/mean_terminated_length": 351.7967224121094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.0384,
"grad_norm": 0.2670823037624359,
"learning_rate": 9.777777777777778e-07,
"loss": -0.066,
"num_tokens": 12206096.0,
"reward": 0.799688994884491,
"reward_std": 0.49856090545654297,
"rewards/accuracy_reward_long_step": 0.2578125,
"rewards/final_brier_reward_long_step": 0.3083125054836273,
"rewards/format_reward_long_step": 0.66796875,
"rewards/stepwise_brier_reward_long_step": 0.5232560634613037,
"step": 24
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 0.0,
"calib/mean_conf": 0.5,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 985.0,
"completions/max_terminated_length": 985.0,
"completions/mean_length": 337.97265625,
"completions/mean_terminated_length": 353.14691162109375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.04,
"grad_norm": 0.12033725529909134,
"learning_rate": 9.722222222222222e-07,
"loss": -0.0464,
"num_tokens": 12725273.0,
"reward": 0.7678510546684265,
"reward_std": 0.5629936456680298,
"rewards/accuracy_reward_long_step": 0.234375,
"rewards/final_brier_reward_long_step": 0.31047940254211426,
"rewards/format_reward_long_step": 0.671875,
"rewards/stepwise_brier_reward_long_step": 0.479674756526947,
"step": 25
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 990.0,
"completions/max_terminated_length": 990.0,
"completions/mean_length": 329.58984375,
"completions/mean_terminated_length": 340.2217712402344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.0416,
"grad_norm": 0.05781063064932823,
"learning_rate": 9.666666666666666e-07,
"loss": -0.0628,
"num_tokens": 13247448.0,
"reward": 0.8087503910064697,
"reward_std": 0.5137581825256348,
"rewards/accuracy_reward_long_step": 0.25390625,
"rewards/final_brier_reward_long_step": 0.32855507731437683,
"rewards/format_reward_long_step": 0.703125,
"rewards/stepwise_brier_reward_long_step": 0.484571635723114,
"step": 26
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.95,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 848.0,
"completions/max_terminated_length": 848.0,
"completions/mean_length": 294.39453125,
"completions/mean_terminated_length": 308.8729248046875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.0432,
"grad_norm": 0.1369597166776657,
"learning_rate": 9.61111111111111e-07,
"loss": -0.0484,
"num_tokens": 13739733.0,
"reward": 0.7955328226089478,
"reward_std": 0.45239853858947754,
"rewards/accuracy_reward_long_step": 0.18359375,
"rewards/final_brier_reward_long_step": 0.35069864988327026,
"rewards/format_reward_long_step": 0.78515625,
"rewards/stepwise_brier_reward_long_step": 0.526745080947876,
"step": 27
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 847.0,
"completions/max_terminated_length": 847.0,
"completions/mean_length": 299.19140625,
"completions/mean_terminated_length": 307.6023864746094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.0448,
"grad_norm": 0.042039766907691956,
"learning_rate": 9.555555555555556e-07,
"loss": -0.0462,
"num_tokens": 14235238.0,
"reward": 0.8760055303573608,
"reward_std": 0.4597265124320984,
"rewards/accuracy_reward_long_step": 0.26171875,
"rewards/final_brier_reward_long_step": 0.3385574221611023,
"rewards/format_reward_long_step": 0.78515625,
"rewards/stepwise_brier_reward_long_step": 0.5482772588729858,
"step": 28
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.95,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 968.0,
"completions/max_terminated_length": 968.0,
"completions/mean_length": 307.88671875,
"completions/mean_terminated_length": 316.5421447753906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.0464,
"grad_norm": 0.17529161274433136,
"learning_rate": 9.499999999999999e-07,
"loss": -0.06,
"num_tokens": 14747433.0,
"reward": 0.8551455736160278,
"reward_std": 0.4609827995300293,
"rewards/accuracy_reward_long_step": 0.22265625,
"rewards/final_brier_reward_long_step": 0.3258519172668457,
"rewards/format_reward_long_step": 0.8125,
"rewards/stepwise_brier_reward_long_step": 0.5791054964065552,
"step": 29
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 910.0,
"completions/max_terminated_length": 910.0,
"completions/mean_length": 292.421875,
"completions/mean_terminated_length": 299.44000244140625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.048,
"grad_norm": 0.05863456800580025,
"learning_rate": 9.444444444444444e-07,
"loss": -0.0448,
"num_tokens": 15236589.0,
"reward": 1.0201013088226318,
"reward_std": 0.43484771251678467,
"rewards/accuracy_reward_long_step": 0.32421875,
"rewards/final_brier_reward_long_step": 0.4454140067100525,
"rewards/format_reward_long_step": 0.8671875,
"rewards/stepwise_brier_reward_long_step": 0.6037412881851196,
"step": 30
},
{
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 639.0,
"completions/max_terminated_length": 639.0,
"completions/mean_length": 284.203125,
"completions/mean_terminated_length": 286.4409484863281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.0496,
"grad_norm": 0.06036984175443649,
"learning_rate": 9.388888888888888e-07,
"loss": -0.0118,
"num_tokens": 15728881.0,
"reward": 1.0137848854064941,
"reward_std": 0.45152291655540466,
"rewards/accuracy_reward_long_step": 0.296875,
"rewards/final_brier_reward_long_step": 0.4248659014701843,
"rewards/format_reward_long_step": 0.88671875,
"rewards/stepwise_brier_reward_long_step": 0.6693359017372131,
"step": 31
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 895.0,
"completions/max_terminated_length": 895.0,
"completions/mean_length": 287.9296875,
"completions/mean_terminated_length": 296.0240783691406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.0512,
"grad_norm": 0.18194638192653656,
"learning_rate": 9.333333333333333e-07,
"loss": -0.0144,
"num_tokens": 16213351.0,
"reward": 0.8365803360939026,
"reward_std": 0.42848724126815796,
"rewards/accuracy_reward_long_step": 0.19921875,
"rewards/final_brier_reward_long_step": 0.34980231523513794,
"rewards/format_reward_long_step": 0.83203125,
"rewards/stepwise_brier_reward_long_step": 0.5355815887451172,
"step": 32
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 893.0,
"completions/max_terminated_length": 893.0,
"completions/mean_length": 298.71875,
"completions/mean_terminated_length": 305.88800048828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.0528,
"grad_norm": 0.11843208968639374,
"learning_rate": 9.277777777777777e-07,
"loss": -0.0299,
"num_tokens": 16715199.0,
"reward": 0.9657071232795715,
"reward_std": 0.40177035331726074,
"rewards/accuracy_reward_long_step": 0.265625,
"rewards/final_brier_reward_long_step": 0.4098663926124573,
"rewards/format_reward_long_step": 0.8984375,
"rewards/stepwise_brier_reward_long_step": 0.5935869812965393,
"step": 33
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 744.0,
"completions/max_terminated_length": 744.0,
"completions/mean_length": 286.67578125,
"completions/mean_terminated_length": 290.0751037597656,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.0544,
"grad_norm": 0.052849605679512024,
"learning_rate": 9.222222222222222e-07,
"loss": -0.0391,
"num_tokens": 17214404.0,
"reward": 0.9088114500045776,
"reward_std": 0.4241497814655304,
"rewards/accuracy_reward_long_step": 0.2265625,
"rewards/final_brier_reward_long_step": 0.37564724683761597,
"rewards/format_reward_long_step": 0.875,
"rewards/stepwise_brier_reward_long_step": 0.6033484935760498,
"step": 34
},
{
"calib/answer_extract_rate": 0.00390625,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.00390625,
"calib/format_rate": 0.0,
"calib/frac_conf_gt_0.9": 1.0,
"calib/mean_conf": 0.95,
"calib/nonempty_final_conf_rate": 0.00390625,
"calib/nonempty_reasoning_rate": 0.00390625,
"calib/nonempty_step_conf_rate": 0.0,
"calib/std_conf": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 835.0,
"completions/max_terminated_length": 835.0,
"completions/mean_length": 270.71484375,
"completions/mean_terminated_length": 273.9249267578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.056,
"grad_norm": 0.049951206892728806,
"learning_rate": 9.166666666666665e-07,
"loss": -0.0334,
"num_tokens": 17712787.0,
"reward": 1.1728079319000244,
"reward_std": 0.4307233691215515,
"rewards/accuracy_reward_long_step": 0.3984375,
"rewards/final_brier_reward_long_step": 0.5377765893936157,
"rewards/format_reward_long_step": 0.93359375,
"rewards/stepwise_brier_reward_long_step": 0.6925181150436401,
"step": 35
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 747.0,
"completions/max_terminated_length": 747.0,
"completions/mean_length": 268.5234375,
"completions/mean_terminated_length": 270.6377868652344,
"completions/min_length": 0.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.0576,
"grad_norm": 0.07510220259428024,
"learning_rate": 9.11111111111111e-07,
"loss": -0.0289,
"num_tokens": 18199657.0,
"reward": 1.1637983322143555,
"reward_std": 0.46460413932800293,
"rewards/accuracy_reward_long_step": 0.390625,
"rewards/final_brier_reward_long_step": 0.5209541916847229,
"rewards/format_reward_long_step": 0.9375,
"rewards/stepwise_brier_reward_long_step": 0.6967387199401855,
"step": 36
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 746.0,
"completions/max_terminated_length": 746.0,
"completions/mean_length": 282.359375,
"completions/mean_terminated_length": 282.359375,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.0592,
"grad_norm": 0.15866929292678833,
"learning_rate": 9.055555555555556e-07,
"loss": 0.0026,
"num_tokens": 18698421.0,
"reward": 1.1005845069885254,
"reward_std": 0.40720871090888977,
"rewards/accuracy_reward_long_step": 0.328125,
"rewards/final_brier_reward_long_step": 0.5032482743263245,
"rewards/format_reward_long_step": 0.9375,
"rewards/stepwise_brier_reward_long_step": 0.7115898132324219,
"step": 37
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 839.0,
"completions/max_terminated_length": 839.0,
"completions/mean_length": 274.265625,
"completions/mean_terminated_length": 275.3411865234375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.0608,
"grad_norm": 0.17633329331874847,
"learning_rate": 9e-07,
"loss": -0.0303,
"num_tokens": 19198241.0,
"reward": 1.034214973449707,
"reward_std": 0.37503814697265625,
"rewards/accuracy_reward_long_step": 0.26171875,
"rewards/final_brier_reward_long_step": 0.45626088976860046,
"rewards/format_reward_long_step": 0.95703125,
"rewards/stepwise_brier_reward_long_step": 0.7196618318557739,
"step": 38
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 909.0,
"completions/max_terminated_length": 909.0,
"completions/mean_length": 266.796875,
"completions/mean_terminated_length": 269.9604797363281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.0624,
"grad_norm": 0.1747618317604065,
"learning_rate": 8.944444444444445e-07,
"loss": -0.0408,
"num_tokens": 19687013.0,
"reward": 0.9186801910400391,
"reward_std": 0.33332592248916626,
"rewards/accuracy_reward_long_step": 0.203125,
"rewards/final_brier_reward_long_step": 0.42509374022483826,
"rewards/format_reward_long_step": 0.94140625,
"rewards/stepwise_brier_reward_long_step": 0.5543146133422852,
"step": 39
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 692.0,
"completions/max_terminated_length": 692.0,
"completions/mean_length": 269.41796875,
"completions/mean_terminated_length": 272.6126708984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.064,
"grad_norm": 0.4975915253162384,
"learning_rate": 8.888888888888888e-07,
"loss": -0.0294,
"num_tokens": 20171848.0,
"reward": 1.0980305671691895,
"reward_std": 0.3671276569366455,
"rewards/accuracy_reward_long_step": 0.32421875,
"rewards/final_brier_reward_long_step": 0.5042629241943359,
"rewards/format_reward_long_step": 0.9296875,
"rewards/stepwise_brier_reward_long_step": 0.7316096425056458,
"step": 40
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 790.0,
"completions/max_terminated_length": 790.0,
"completions/mean_length": 276.7265625,
"completions/mean_terminated_length": 276.7265625,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.0656,
"grad_norm": 0.19570253789424896,
"learning_rate": 8.833333333333333e-07,
"loss": -0.0075,
"num_tokens": 20676338.0,
"reward": 0.9547429084777832,
"reward_std": 0.35480546951293945,
"rewards/accuracy_reward_long_step": 0.2265625,
"rewards/final_brier_reward_long_step": 0.4319256842136383,
"rewards/format_reward_long_step": 0.91796875,
"rewards/stepwise_brier_reward_long_step": 0.6448584794998169,
"step": 41
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 708.0,
"completions/max_terminated_length": 708.0,
"completions/mean_length": 271.44140625,
"completions/mean_terminated_length": 272.5058898925781,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.0672,
"grad_norm": 0.09999877214431763,
"learning_rate": 8.777777777777777e-07,
"loss": 0.0075,
"num_tokens": 21162843.0,
"reward": 1.261709213256836,
"reward_std": 0.47088778018951416,
"rewards/accuracy_reward_long_step": 0.47265625,
"rewards/final_brier_reward_long_step": 0.5992207527160645,
"rewards/format_reward_long_step": 0.9296875,
"rewards/stepwise_brier_reward_long_step": 0.697616457939148,
"step": 42
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 634.0,
"completions/max_terminated_length": 634.0,
"completions/mean_length": 268.66015625,
"completions/mean_terminated_length": 268.66015625,
"completions/min_length": 95.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.0688,
"grad_norm": 0.0876559317111969,
"learning_rate": 8.722222222222222e-07,
"loss": -0.0159,
"num_tokens": 21664932.0,
"reward": 1.074408769607544,
"reward_std": 0.3138850927352905,
"rewards/accuracy_reward_long_step": 0.28515625,
"rewards/final_brier_reward_long_step": 0.49867674708366394,
"rewards/format_reward_long_step": 0.95703125,
"rewards/stepwise_brier_reward_long_step": 0.744270920753479,
"step": 43
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 802.0,
"completions/max_terminated_length": 802.0,
"completions/mean_length": 288.703125,
"completions/mean_terminated_length": 288.703125,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.0704,
"grad_norm": 0.27019646763801575,
"learning_rate": 8.666666666666667e-07,
"loss": 0.0153,
"num_tokens": 22174696.0,
"reward": 1.186435580253601,
"reward_std": 0.3493732213973999,
"rewards/accuracy_reward_long_step": 0.40234375,
"rewards/final_brier_reward_long_step": 0.5960390567779541,
"rewards/format_reward_long_step": 0.94921875,
"rewards/stepwise_brier_reward_long_step": 0.6418907642364502,
"step": 44
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 829.0,
"completions/max_terminated_length": 829.0,
"completions/mean_length": 276.6796875,
"completions/mean_terminated_length": 278.8582763671875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.072,
"grad_norm": 0.03910618647933006,
"learning_rate": 8.611111111111111e-07,
"loss": -0.0197,
"num_tokens": 22667598.0,
"reward": 1.0908488035202026,
"reward_std": 0.3936161398887634,
"rewards/accuracy_reward_long_step": 0.30078125,
"rewards/final_brier_reward_long_step": 0.571899950504303,
"rewards/format_reward_long_step": 0.9375,
"rewards/stepwise_brier_reward_long_step": 0.7133700847625732,
"step": 45
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 946.0,
"completions/max_terminated_length": 946.0,
"completions/mean_length": 258.7109375,
"completions/mean_terminated_length": 260.7480163574219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.0736,
"grad_norm": 0.05340059474110603,
"learning_rate": 8.555555555555555e-07,
"loss": -0.0053,
"num_tokens": 23136156.0,
"reward": 1.0639352798461914,
"reward_std": 0.35911956429481506,
"rewards/accuracy_reward_long_step": 0.27734375,
"rewards/final_brier_reward_long_step": 0.5164073705673218,
"rewards/format_reward_long_step": 0.96484375,
"rewards/stepwise_brier_reward_long_step": 0.7002708315849304,
"step": 46
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 750.0,
"completions/max_terminated_length": 750.0,
"completions/mean_length": 258.69140625,
"completions/mean_terminated_length": 258.69140625,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.0752,
"grad_norm": 0.15533719956874847,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0048,
"num_tokens": 23612765.0,
"reward": 1.1923271417617798,
"reward_std": 0.36889374256134033,
"rewards/accuracy_reward_long_step": 0.375,
"rewards/final_brier_reward_long_step": 0.591122031211853,
"rewards/format_reward_long_step": 0.9765625,
"rewards/stepwise_brier_reward_long_step": 0.7250616550445557,
"step": 47
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 983.0,
"completions/max_terminated_length": 983.0,
"completions/mean_length": 264.12890625,
"completions/mean_terminated_length": 266.2086486816406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.0768,
"grad_norm": 0.0998203307390213,
"learning_rate": 8.444444444444444e-07,
"loss": 0.0031,
"num_tokens": 24090910.0,
"reward": 1.1535530090332031,
"reward_std": 0.3134958744049072,
"rewards/accuracy_reward_long_step": 0.328125,
"rewards/final_brier_reward_long_step": 0.6210886240005493,
"rewards/format_reward_long_step": 0.96875,
"rewards/stepwise_brier_reward_long_step": 0.7431235313415527,
"step": 48
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 778.0,
"completions/max_terminated_length": 778.0,
"completions/mean_length": 275.03515625,
"completions/mean_terminated_length": 275.03515625,
"completions/min_length": 120.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.0784,
"grad_norm": 0.09424059092998505,
"learning_rate": 8.388888888888888e-07,
"loss": 0.0141,
"num_tokens": 24596631.0,
"reward": 1.1789720058441162,
"reward_std": 0.3338298797607422,
"rewards/accuracy_reward_long_step": 0.34765625,
"rewards/final_brier_reward_long_step": 0.615998387336731,
"rewards/format_reward_long_step": 0.97265625,
"rewards/stepwise_brier_reward_long_step": 0.7639520764350891,
"step": 49
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 657.0,
"completions/max_terminated_length": 657.0,
"completions/mean_length": 252.19140625,
"completions/mean_terminated_length": 252.19140625,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.08,
"grad_norm": 0.21770058572292328,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0245,
"num_tokens": 25085544.0,
"reward": 1.1415338516235352,
"reward_std": 0.3256559669971466,
"rewards/accuracy_reward_long_step": 0.3203125,
"rewards/final_brier_reward_long_step": 0.6072898507118225,
"rewards/format_reward_long_step": 0.96484375,
"rewards/stepwise_brier_reward_long_step": 0.7479082345962524,
"step": 50
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 948.0,
"completions/max_terminated_length": 948.0,
"completions/mean_length": 254.66796875,
"completions/mean_terminated_length": 256.6732177734375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.0816,
"grad_norm": 0.08060960471630096,
"learning_rate": 8.277777777777777e-07,
"loss": -0.0449,
"num_tokens": 25574811.0,
"reward": 1.2235015630722046,
"reward_std": 0.3895331621170044,
"rewards/accuracy_reward_long_step": 0.39453125,
"rewards/final_brier_reward_long_step": 0.6493412256240845,
"rewards/format_reward_long_step": 0.95703125,
"rewards/stepwise_brier_reward_long_step": 0.7524775266647339,
"step": 51
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 582.0,
"completions/max_terminated_length": 582.0,
"completions/mean_length": 255.12890625,
"completions/mean_terminated_length": 255.12890625,
"completions/min_length": 84.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.0832,
"grad_norm": 0.1500002145767212,
"learning_rate": 8.222222222222221e-07,
"loss": -0.0241,
"num_tokens": 26071732.0,
"reward": 1.1828399896621704,
"reward_std": 0.2660859525203705,
"rewards/accuracy_reward_long_step": 0.34375,
"rewards/final_brier_reward_long_step": 0.6490460634231567,
"rewards/format_reward_long_step": 0.984375,
"rewards/stepwise_brier_reward_long_step": 0.738564133644104,
"step": 52
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 956.0,
"completions/max_terminated_length": 956.0,
"completions/mean_length": 259.86328125,
"completions/mean_terminated_length": 260.8823547363281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.0848,
"grad_norm": 0.093411885201931,
"learning_rate": 8.166666666666666e-07,
"loss": -0.0201,
"num_tokens": 26561105.0,
"reward": 1.0413353443145752,
"reward_std": 0.2801922559738159,
"rewards/accuracy_reward_long_step": 0.22265625,
"rewards/final_brier_reward_long_step": 0.6196457147598267,
"rewards/format_reward_long_step": 0.9609375,
"rewards/stepwise_brier_reward_long_step": 0.7331956624984741,
"step": 53
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 599.0,
"completions/max_terminated_length": 599.0,
"completions/mean_length": 242.04296875,
"completions/mean_terminated_length": 242.04296875,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.0864,
"grad_norm": 0.061436332762241364,
"learning_rate": 8.11111111111111e-07,
"loss": 0.0006,
"num_tokens": 27042404.0,
"reward": 1.1769518852233887,
"reward_std": 0.3248249292373657,
"rewards/accuracy_reward_long_step": 0.33203125,
"rewards/final_brier_reward_long_step": 0.6683531999588013,
"rewards/format_reward_long_step": 0.96875,
"rewards/stepwise_brier_reward_long_step": 0.7738291025161743,
"step": 54
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 530.0,
"completions/max_terminated_length": 530.0,
"completions/mean_length": 239.16796875,
"completions/mean_terminated_length": 240.10589599609375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.088,
"grad_norm": 0.1144290566444397,
"learning_rate": 8.055555555555556e-07,
"loss": -0.0147,
"num_tokens": 27540463.0,
"reward": 1.2522761821746826,
"reward_std": 0.29164189100265503,
"rewards/accuracy_reward_long_step": 0.390625,
"rewards/final_brier_reward_long_step": 0.6936222314834595,
"rewards/format_reward_long_step": 0.98828125,
"rewards/stepwise_brier_reward_long_step": 0.7764202356338501,
"step": 55
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 254.3984375,
"completions/mean_terminated_length": 255.39608764648438,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.0896,
"grad_norm": 0.056342847645282745,
"learning_rate": 8e-07,
"loss": -0.0099,
"num_tokens": 28036997.0,
"reward": 1.2792823314666748,
"reward_std": 0.26004308462142944,
"rewards/accuracy_reward_long_step": 0.421875,
"rewards/final_brier_reward_long_step": 0.7217453122138977,
"rewards/format_reward_long_step": 0.98828125,
"rewards/stepwise_brier_reward_long_step": 0.7313213348388672,
"step": 56
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 549.0,
"completions/max_terminated_length": 549.0,
"completions/mean_length": 247.80859375,
"completions/mean_terminated_length": 247.80859375,
"completions/min_length": 95.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.0912,
"grad_norm": 0.03911300748586655,
"learning_rate": 7.944444444444444e-07,
"loss": 0.0139,
"num_tokens": 28513988.0,
"reward": 1.195192575454712,
"reward_std": 0.32286006212234497,
"rewards/accuracy_reward_long_step": 0.33984375,
"rewards/final_brier_reward_long_step": 0.7113619446754456,
"rewards/format_reward_long_step": 0.98046875,
"rewards/stepwise_brier_reward_long_step": 0.7490957975387573,
"step": 57
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 999.0,
"completions/max_terminated_length": 999.0,
"completions/mean_length": 242.87109375,
"completions/mean_terminated_length": 243.82354736328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.0928,
"grad_norm": 0.10559872537851334,
"learning_rate": 7.888888888888889e-07,
"loss": -0.0377,
"num_tokens": 29001755.0,
"reward": 1.17831289768219,
"reward_std": 0.2993444800376892,
"rewards/accuracy_reward_long_step": 0.3203125,
"rewards/final_brier_reward_long_step": 0.7109405994415283,
"rewards/format_reward_long_step": 0.98828125,
"rewards/stepwise_brier_reward_long_step": 0.7444983720779419,
"step": 58
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 650.0,
"completions/max_terminated_length": 650.0,
"completions/mean_length": 253.3828125,
"completions/mean_terminated_length": 254.37648010253906,
"completions/min_length": 0.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.0944,
"grad_norm": 0.03542870655655861,
"learning_rate": 7.833333333333333e-07,
"loss": -0.0018,
"num_tokens": 29495773.0,
"reward": 1.2244206666946411,
"reward_std": 0.2692364752292633,
"rewards/accuracy_reward_long_step": 0.3515625,
"rewards/final_brier_reward_long_step": 0.7496439814567566,
"rewards/format_reward_long_step": 0.98828125,
"rewards/stepwise_brier_reward_long_step": 0.7652260065078735,
"step": 59
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 611.0,
"completions/max_terminated_length": 611.0,
"completions/mean_length": 245.73046875,
"completions/mean_terminated_length": 245.73046875,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.096,
"grad_norm": 0.04337484389543533,
"learning_rate": 7.777777777777778e-07,
"loss": 0.028,
"num_tokens": 29976952.0,
"reward": 1.199751853942871,
"reward_std": 0.2547294497489929,
"rewards/accuracy_reward_long_step": 0.33203125,
"rewards/final_brier_reward_long_step": 0.7303680181503296,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7561392784118652,
"step": 60
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 749.0,
"completions/max_terminated_length": 749.0,
"completions/mean_length": 247.1015625,
"completions/mean_terminated_length": 248.0706024169922,
"completions/min_length": 0.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.0976,
"grad_norm": 0.06807160377502441,
"learning_rate": 7.722222222222222e-07,
"loss": -0.0207,
"num_tokens": 30471338.0,
"reward": 1.2661097049713135,
"reward_std": 0.2852725088596344,
"rewards/accuracy_reward_long_step": 0.41015625,
"rewards/final_brier_reward_long_step": 0.716038703918457,
"rewards/format_reward_long_step": 0.96875,
"rewards/stepwise_brier_reward_long_step": 0.770275354385376,
"step": 61
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 664.0,
"completions/max_terminated_length": 664.0,
"completions/mean_length": 246.7734375,
"completions/mean_terminated_length": 246.7734375,
"completions/min_length": 97.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.0992,
"grad_norm": 0.03863685578107834,
"learning_rate": 7.666666666666667e-07,
"loss": 0.0143,
"num_tokens": 30953648.0,
"reward": 1.2412984371185303,
"reward_std": 0.21361978352069855,
"rewards/accuracy_reward_long_step": 0.37890625,
"rewards/final_brier_reward_long_step": 0.7338235974311829,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7313702702522278,
"step": 62
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 518.0,
"completions/max_terminated_length": 518.0,
"completions/mean_length": 243.95703125,
"completions/mean_terminated_length": 244.9137420654297,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.1008,
"grad_norm": 0.04952770844101906,
"learning_rate": 7.61111111111111e-07,
"loss": -0.0062,
"num_tokens": 31442845.0,
"reward": 1.2129815816879272,
"reward_std": 0.28704124689102173,
"rewards/accuracy_reward_long_step": 0.33203125,
"rewards/final_brier_reward_long_step": 0.7710623741149902,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7683641314506531,
"step": 63
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 878.0,
"completions/max_terminated_length": 878.0,
"completions/mean_length": 241.94140625,
"completions/mean_terminated_length": 243.84645080566406,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.1024,
"grad_norm": 0.08823659271001816,
"learning_rate": 7.555555555555555e-07,
"loss": -0.0045,
"num_tokens": 31942366.0,
"reward": 1.1511372327804565,
"reward_std": 0.27987030148506165,
"rewards/accuracy_reward_long_step": 0.2734375,
"rewards/final_brier_reward_long_step": 0.7818734645843506,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7445504665374756,
"step": 64
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 980.0,
"completions/max_terminated_length": 980.0,
"completions/mean_length": 246.95703125,
"completions/mean_terminated_length": 247.92550659179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.104,
"grad_norm": 0.03668355569243431,
"learning_rate": 7.5e-07,
"loss": -0.0188,
"num_tokens": 32434235.0,
"reward": 1.180516004562378,
"reward_std": 0.21570606529712677,
"rewards/accuracy_reward_long_step": 0.30859375,
"rewards/final_brier_reward_long_step": 0.7829951047897339,
"rewards/format_reward_long_step": 0.984375,
"rewards/stepwise_brier_reward_long_step": 0.7359441518783569,
"step": 65
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 600.0,
"completions/max_terminated_length": 600.0,
"completions/mean_length": 241.75,
"completions/mean_terminated_length": 241.75,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.1056,
"grad_norm": 0.20867598056793213,
"learning_rate": 7.444444444444444e-07,
"loss": 0.011,
"num_tokens": 32930075.0,
"reward": 1.2010526657104492,
"reward_std": 0.2591190040111542,
"rewards/accuracy_reward_long_step": 0.33203125,
"rewards/final_brier_reward_long_step": 0.7352352142333984,
"rewards/format_reward_long_step": 0.98046875,
"rewards/stepwise_brier_reward_long_step": 0.7799128890037537,
"step": 66
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 817.0,
"completions/max_terminated_length": 817.0,
"completions/mean_length": 237.52734375,
"completions/mean_terminated_length": 237.52734375,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.1072,
"grad_norm": 0.055712711066007614,
"learning_rate": 7.388888888888889e-07,
"loss": 0.0143,
"num_tokens": 33426218.0,
"reward": 1.307477593421936,
"reward_std": 0.2710718512535095,
"rewards/accuracy_reward_long_step": 0.44921875,
"rewards/final_brier_reward_long_step": 0.7135553956031799,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7351051568984985,
"step": 67
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 983.0,
"completions/max_terminated_length": 983.0,
"completions/mean_length": 251.796875,
"completions/mean_terminated_length": 251.796875,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.1088,
"grad_norm": 0.060389939695596695,
"learning_rate": 7.333333333333332e-07,
"loss": 0.0096,
"num_tokens": 33915718.0,
"reward": 1.2436058521270752,
"reward_std": 0.32295602560043335,
"rewards/accuracy_reward_long_step": 0.3984375,
"rewards/final_brier_reward_long_step": 0.717818021774292,
"rewards/format_reward_long_step": 0.98828125,
"rewards/stepwise_brier_reward_long_step": 0.6862928867340088,
"step": 68
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 652.0,
"completions/max_terminated_length": 652.0,
"completions/mean_length": 239.89453125,
"completions/mean_terminated_length": 239.89453125,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.1104,
"grad_norm": 0.033029954880476,
"learning_rate": 7.277777777777777e-07,
"loss": 0.0144,
"num_tokens": 34381011.0,
"reward": 1.3730568885803223,
"reward_std": 0.26877105236053467,
"rewards/accuracy_reward_long_step": 0.52734375,
"rewards/final_brier_reward_long_step": 0.6850621700286865,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7056032419204712,
"step": 69
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 778.0,
"completions/max_terminated_length": 778.0,
"completions/mean_length": 237.265625,
"completions/mean_terminated_length": 237.265625,
"completions/min_length": 109.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.112,
"grad_norm": 0.09013690799474716,
"learning_rate": 7.222222222222221e-07,
"loss": 0.0035,
"num_tokens": 34869191.0,
"reward": 1.251227855682373,
"reward_std": 0.19920992851257324,
"rewards/accuracy_reward_long_step": 0.37890625,
"rewards/final_brier_reward_long_step": 0.7329218983650208,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7641769051551819,
"step": 70
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 717.0,
"completions/max_terminated_length": 717.0,
"completions/mean_length": 229.0859375,
"completions/mean_terminated_length": 229.0859375,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.1136,
"grad_norm": 0.03403422236442566,
"learning_rate": 7.166666666666667e-07,
"loss": -0.0026,
"num_tokens": 35346397.0,
"reward": 1.293338656425476,
"reward_std": 0.3115568161010742,
"rewards/accuracy_reward_long_step": 0.44921875,
"rewards/final_brier_reward_long_step": 0.710399866104126,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.6817047595977783,
"step": 71
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 537.0,
"completions/max_terminated_length": 537.0,
"completions/mean_length": 225.59765625,
"completions/mean_terminated_length": 226.48236083984375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.1152,
"grad_norm": 0.04565083235502243,
"learning_rate": 7.111111111111111e-07,
"loss": -0.0086,
"num_tokens": 35817766.0,
"reward": 1.199450969696045,
"reward_std": 0.1990368813276291,
"rewards/accuracy_reward_long_step": 0.32421875,
"rewards/final_brier_reward_long_step": 0.762933611869812,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7458075284957886,
"step": 72
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 773.0,
"completions/max_terminated_length": 773.0,
"completions/mean_length": 237.078125,
"completions/mean_terminated_length": 238.0078582763672,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.1168,
"grad_norm": 0.16507937014102936,
"learning_rate": 7.055555555555556e-07,
"loss": 0.0011,
"num_tokens": 36306426.0,
"reward": 1.1114095449447632,
"reward_std": 0.23765724897384644,
"rewards/accuracy_reward_long_step": 0.2421875,
"rewards/final_brier_reward_long_step": 0.7882089614868164,
"rewards/format_reward_long_step": 0.96875,
"rewards/stepwise_brier_reward_long_step": 0.7511793971061707,
"step": 73
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 572.0,
"completions/max_terminated_length": 572.0,
"completions/mean_length": 222.734375,
"completions/mean_terminated_length": 222.734375,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.1184,
"grad_norm": 0.10383451730012894,
"learning_rate": 7e-07,
"loss": -0.0018,
"num_tokens": 36773214.0,
"reward": 1.2845209836959839,
"reward_std": 0.2531934976577759,
"rewards/accuracy_reward_long_step": 0.43359375,
"rewards/final_brier_reward_long_step": 0.700976550579071,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.710544764995575,
"step": 74
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 518.0,
"completions/max_terminated_length": 518.0,
"completions/mean_length": 227.99609375,
"completions/mean_terminated_length": 227.99609375,
"completions/min_length": 94.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.12,
"grad_norm": 0.04366622865200043,
"learning_rate": 6.944444444444444e-07,
"loss": -0.0197,
"num_tokens": 37265613.0,
"reward": 1.2163734436035156,
"reward_std": 0.20943868160247803,
"rewards/accuracy_reward_long_step": 0.359375,
"rewards/final_brier_reward_long_step": 0.7348078489303589,
"rewards/format_reward_long_step": 0.98828125,
"rewards/stepwise_brier_reward_long_step": 0.7166235446929932,
"step": 75
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 611.0,
"completions/max_terminated_length": 611.0,
"completions/mean_length": 225.1484375,
"completions/mean_terminated_length": 226.03138732910156,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.1216,
"grad_norm": 0.06976811587810516,
"learning_rate": 6.888888888888889e-07,
"loss": -0.0152,
"num_tokens": 37743499.0,
"reward": 1.2767009735107422,
"reward_std": 0.16413617134094238,
"rewards/accuracy_reward_long_step": 0.4140625,
"rewards/final_brier_reward_long_step": 0.7078866958618164,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7504795789718628,
"step": 76
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 527.0,
"completions/max_terminated_length": 527.0,
"completions/mean_length": 231.44921875,
"completions/mean_terminated_length": 231.44921875,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.1232,
"grad_norm": 0.04065747186541557,
"learning_rate": 6.833333333333333e-07,
"loss": 0.0018,
"num_tokens": 38227782.0,
"reward": 1.2329548597335815,
"reward_std": 0.1812242567539215,
"rewards/accuracy_reward_long_step": 0.37109375,
"rewards/final_brier_reward_long_step": 0.7489936351776123,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.6984509229660034,
"step": 77
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 562.0,
"completions/max_terminated_length": 562.0,
"completions/mean_length": 227.01171875,
"completions/mean_terminated_length": 227.01171875,
"completions/min_length": 111.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.1248,
"grad_norm": 0.03462184593081474,
"learning_rate": 6.777777777777778e-07,
"loss": 0.0084,
"num_tokens": 38706801.0,
"reward": 1.2074387073516846,
"reward_std": 0.23012541234493256,
"rewards/accuracy_reward_long_step": 0.35546875,
"rewards/final_brier_reward_long_step": 0.7311980724334717,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.6923068761825562,
"step": 78
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 601.0,
"completions/max_terminated_length": 601.0,
"completions/mean_length": 214.515625,
"completions/mean_terminated_length": 214.515625,
"completions/min_length": 95.0,
"completions/min_terminated_length": 95.0,
"epoch": 0.1264,
"grad_norm": 0.06125793233513832,
"learning_rate": 6.722222222222222e-07,
"loss": 0.0054,
"num_tokens": 39185933.0,
"reward": 1.202413558959961,
"reward_std": 0.1790447235107422,
"rewards/accuracy_reward_long_step": 0.33203125,
"rewards/final_brier_reward_long_step": 0.7572082281112671,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7243211269378662,
"step": 79
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 461.0,
"completions/max_terminated_length": 461.0,
"completions/mean_length": 209.36328125,
"completions/mean_terminated_length": 209.36328125,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.128,
"grad_norm": 0.0341668576002121,
"learning_rate": 6.666666666666666e-07,
"loss": 0.005,
"num_tokens": 39665874.0,
"reward": 1.1575262546539307,
"reward_std": 0.19264450669288635,
"rewards/accuracy_reward_long_step": 0.2734375,
"rewards/final_brier_reward_long_step": 0.7840448617935181,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7679354548454285,
"step": 80
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 578.0,
"completions/max_terminated_length": 578.0,
"completions/mean_length": 218.9296875,
"completions/mean_terminated_length": 218.9296875,
"completions/min_length": 93.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.1296,
"grad_norm": 0.056944340467453,
"learning_rate": 6.611111111111111e-07,
"loss": -0.0004,
"num_tokens": 40145992.0,
"reward": 1.2321239709854126,
"reward_std": 0.288753867149353,
"rewards/accuracy_reward_long_step": 0.36328125,
"rewards/final_brier_reward_long_step": 0.7278487682342529,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.755334734916687,
"step": 81
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 469.0,
"completions/max_terminated_length": 469.0,
"completions/mean_length": 216.609375,
"completions/mean_terminated_length": 217.45883178710938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.1312,
"grad_norm": 0.040777526795864105,
"learning_rate": 6.555555555555555e-07,
"loss": -0.0166,
"num_tokens": 40628988.0,
"reward": 1.2330138683319092,
"reward_std": 0.2308121919631958,
"rewards/accuracy_reward_long_step": 0.37109375,
"rewards/final_brier_reward_long_step": 0.7352034449577332,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7202898263931274,
"step": 82
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 859.0,
"completions/max_terminated_length": 859.0,
"completions/mean_length": 220.13671875,
"completions/mean_terminated_length": 220.13671875,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.1328,
"grad_norm": 0.2267780750989914,
"learning_rate": 6.5e-07,
"loss": -0.0127,
"num_tokens": 41107423.0,
"reward": 1.2562687397003174,
"reward_std": 0.2288559228181839,
"rewards/accuracy_reward_long_step": 0.3984375,
"rewards/final_brier_reward_long_step": 0.6950433254241943,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7362817525863647,
"step": 83
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 631.0,
"completions/max_terminated_length": 631.0,
"completions/mean_length": 225.0703125,
"completions/mean_terminated_length": 225.0703125,
"completions/min_length": 114.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.1344,
"grad_norm": 0.04118198901414871,
"learning_rate": 6.444444444444444e-07,
"loss": 0.0058,
"num_tokens": 41599321.0,
"reward": 1.3327488899230957,
"reward_std": 0.1981910765171051,
"rewards/accuracy_reward_long_step": 0.48828125,
"rewards/final_brier_reward_long_step": 0.6561777591705322,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7295053005218506,
"step": 84
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 499.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 219.65625,
"completions/mean_terminated_length": 219.65625,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.136,
"grad_norm": 0.04954572767019272,
"learning_rate": 6.388888888888888e-07,
"loss": 0.0006,
"num_tokens": 42074473.0,
"reward": 1.3892377614974976,
"reward_std": 0.1905529648065567,
"rewards/accuracy_reward_long_step": 0.5546875,
"rewards/final_brier_reward_long_step": 0.6077094078063965,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7383042573928833,
"step": 85
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 211.30078125,
"completions/mean_terminated_length": 211.30078125,
"completions/min_length": 98.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.1376,
"grad_norm": 0.04808489605784416,
"learning_rate": 6.333333333333332e-07,
"loss": -0.0179,
"num_tokens": 42543374.0,
"reward": 1.3056892156600952,
"reward_std": 0.18410624563694,
"rewards/accuracy_reward_long_step": 0.4453125,
"rewards/final_brier_reward_long_step": 0.670364260673523,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7711423635482788,
"step": 86
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 545.0,
"completions/max_terminated_length": 545.0,
"completions/mean_length": 216.82421875,
"completions/mean_terminated_length": 217.67453002929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.1392,
"grad_norm": 0.07117484509944916,
"learning_rate": 6.277777777777777e-07,
"loss": -0.0033,
"num_tokens": 43021761.0,
"reward": 1.3590378761291504,
"reward_std": 0.20214568078517914,
"rewards/accuracy_reward_long_step": 0.5390625,
"rewards/final_brier_reward_long_step": 0.604397177696228,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.6833170652389526,
"step": 87
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 584.0,
"completions/max_terminated_length": 584.0,
"completions/mean_length": 232.453125,
"completions/mean_terminated_length": 232.453125,
"completions/min_length": 109.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.1408,
"grad_norm": 0.05832771211862564,
"learning_rate": 6.222222222222223e-07,
"loss": 0.0138,
"num_tokens": 43497541.0,
"reward": 1.3088263273239136,
"reward_std": 0.17070481181144714,
"rewards/accuracy_reward_long_step": 0.4375,
"rewards/final_brier_reward_long_step": 0.6950075626373291,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.79029780626297,
"step": 88
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 770.0,
"completions/max_terminated_length": 770.0,
"completions/mean_length": 224.8828125,
"completions/mean_terminated_length": 224.8828125,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.1424,
"grad_norm": 0.031236618757247925,
"learning_rate": 6.166666666666667e-07,
"loss": -0.0017,
"num_tokens": 43984239.0,
"reward": 1.2366325855255127,
"reward_std": 0.22545480728149414,
"rewards/accuracy_reward_long_step": 0.36328125,
"rewards/final_brier_reward_long_step": 0.7334580421447754,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7599474191665649,
"step": 89
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 531.0,
"completions/max_terminated_length": 531.0,
"completions/mean_length": 220.33203125,
"completions/mean_terminated_length": 220.33203125,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.144,
"grad_norm": 0.032982852309942245,
"learning_rate": 6.111111111111112e-07,
"loss": 0.0033,
"num_tokens": 44459788.0,
"reward": 1.2820594310760498,
"reward_std": 0.24406251311302185,
"rewards/accuracy_reward_long_step": 0.43359375,
"rewards/final_brier_reward_long_step": 0.6791086792945862,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7303793430328369,
"step": 90
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 521.0,
"completions/max_terminated_length": 521.0,
"completions/mean_length": 224.7734375,
"completions/mean_terminated_length": 224.7734375,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.1456,
"grad_norm": 0.04948606342077255,
"learning_rate": 6.055555555555555e-07,
"loss": -0.0038,
"num_tokens": 44927786.0,
"reward": 1.312872290611267,
"reward_std": 0.21360260248184204,
"rewards/accuracy_reward_long_step": 0.44921875,
"rewards/final_brier_reward_long_step": 0.681350588798523,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7732632756233215,
"step": 91
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 567.0,
"completions/max_terminated_length": 567.0,
"completions/mean_length": 221.44140625,
"completions/mean_terminated_length": 221.44140625,
"completions/min_length": 106.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.1472,
"grad_norm": 0.06898247450590134,
"learning_rate": 6e-07,
"loss": -0.0107,
"num_tokens": 45407147.0,
"reward": 1.3344571590423584,
"reward_std": 0.21049799025058746,
"rewards/accuracy_reward_long_step": 0.484375,
"rewards/final_brier_reward_long_step": 0.6809437870979309,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.719385027885437,
"step": 92
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 508.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 233.40234375,
"completions/mean_terminated_length": 233.40234375,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.1488,
"grad_norm": 0.034343279898166656,
"learning_rate": 5.944444444444444e-07,
"loss": -0.0167,
"num_tokens": 45889994.0,
"reward": 1.250670075416565,
"reward_std": 0.20306074619293213,
"rewards/accuracy_reward_long_step": 0.3828125,
"rewards/final_brier_reward_long_step": 0.723064661026001,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7561779022216797,
"step": 93
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 593.0,
"completions/max_terminated_length": 593.0,
"completions/mean_length": 239.5859375,
"completions/mean_terminated_length": 239.5859375,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.1504,
"grad_norm": 0.03516862913966179,
"learning_rate": 5.888888888888889e-07,
"loss": -0.0025,
"num_tokens": 46369656.0,
"reward": 1.3316434621810913,
"reward_std": 0.2240450382232666,
"rewards/accuracy_reward_long_step": 0.46484375,
"rewards/final_brier_reward_long_step": 0.6970722675323486,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7701265811920166,
"step": 94
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 536.0,
"completions/max_terminated_length": 536.0,
"completions/mean_length": 233.03125,
"completions/mean_terminated_length": 233.03125,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.152,
"grad_norm": 0.04016004502773285,
"learning_rate": 5.833333333333334e-07,
"loss": 0.0016,
"num_tokens": 46861144.0,
"reward": 1.2625809907913208,
"reward_std": 0.21456453204154968,
"rewards/accuracy_reward_long_step": 0.39453125,
"rewards/final_brier_reward_long_step": 0.7184839248657227,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7693402767181396,
"step": 95
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 721.0,
"completions/max_terminated_length": 721.0,
"completions/mean_length": 241.79296875,
"completions/mean_terminated_length": 241.79296875,
"completions/min_length": 116.0,
"completions/min_terminated_length": 116.0,
"epoch": 0.1536,
"grad_norm": 0.03148433566093445,
"learning_rate": 5.777777777777777e-07,
"loss": 0.0126,
"num_tokens": 47360659.0,
"reward": 1.303478479385376,
"reward_std": 0.18032774329185486,
"rewards/accuracy_reward_long_step": 0.44140625,
"rewards/final_brier_reward_long_step": 0.7018147706985474,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7464738488197327,
"step": 96
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 761.0,
"completions/max_terminated_length": 761.0,
"completions/mean_length": 241.2734375,
"completions/mean_terminated_length": 241.2734375,
"completions/min_length": 118.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.1552,
"grad_norm": 0.03487463667988777,
"learning_rate": 5.722222222222222e-07,
"loss": 0.0105,
"num_tokens": 47839465.0,
"reward": 1.3190314769744873,
"reward_std": 0.18113639950752258,
"rewards/accuracy_reward_long_step": 0.4453125,
"rewards/final_brier_reward_long_step": 0.718437910079956,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7764377593994141,
"step": 97
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 614.0,
"completions/max_terminated_length": 614.0,
"completions/mean_length": 244.4296875,
"completions/mean_terminated_length": 244.4296875,
"completions/min_length": 114.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.1568,
"grad_norm": 0.06506786495447159,
"learning_rate": 5.666666666666666e-07,
"loss": 0.0106,
"num_tokens": 48313367.0,
"reward": 1.214963436126709,
"reward_std": 0.18929462134838104,
"rewards/accuracy_reward_long_step": 0.3203125,
"rewards/final_brier_reward_long_step": 0.7821929454803467,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7964109182357788,
"step": 98
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 641.0,
"completions/max_terminated_length": 641.0,
"completions/mean_length": 244.140625,
"completions/mean_terminated_length": 244.140625,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.1584,
"grad_norm": 0.032129108905792236,
"learning_rate": 5.611111111111111e-07,
"loss": 0.0066,
"num_tokens": 48807115.0,
"reward": 1.2324862480163574,
"reward_std": 0.1833975911140442,
"rewards/accuracy_reward_long_step": 0.33984375,
"rewards/final_brier_reward_long_step": 0.7741453051567078,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.8042370080947876,
"step": 99
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 605.0,
"completions/max_terminated_length": 605.0,
"completions/mean_length": 248.4140625,
"completions/mean_terminated_length": 248.4140625,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"epoch": 0.16,
"grad_norm": 0.041153669357299805,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0039,
"num_tokens": 49313845.0,
"reward": 1.3223180770874023,
"reward_std": 0.2212940752506256,
"rewards/accuracy_reward_long_step": 0.46484375,
"rewards/final_brier_reward_long_step": 0.6976035237312317,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7479186058044434,
"step": 100
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 477.0,
"completions/max_terminated_length": 477.0,
"completions/mean_length": 235.953125,
"completions/mean_terminated_length": 236.87844848632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.1616,
"grad_norm": 0.05703693628311157,
"learning_rate": 5.5e-07,
"loss": -0.0125,
"num_tokens": 49808721.0,
"reward": 1.289008378982544,
"reward_std": 0.11740753054618835,
"rewards/accuracy_reward_long_step": 0.40625,
"rewards/final_brier_reward_long_step": 0.7392846345901489,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7995613813400269,
"step": 101
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 544.0,
"completions/max_terminated_length": 544.0,
"completions/mean_length": 242.234375,
"completions/mean_terminated_length": 242.234375,
"completions/min_length": 122.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.1632,
"grad_norm": 0.04496127367019653,
"learning_rate": 5.444444444444443e-07,
"loss": 0.0125,
"num_tokens": 50293829.0,
"reward": 1.2432105541229248,
"reward_std": 0.18485994637012482,
"rewards/accuracy_reward_long_step": 0.35546875,
"rewards/final_brier_reward_long_step": 0.7693418264389038,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7816250920295715,
"step": 102
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 557.0,
"completions/max_terminated_length": 557.0,
"completions/mean_length": 245.44921875,
"completions/mean_terminated_length": 245.44921875,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.1648,
"grad_norm": 0.035261496901512146,
"learning_rate": 5.388888888888888e-07,
"loss": -0.003,
"num_tokens": 50795808.0,
"reward": 1.312826156616211,
"reward_std": 0.2417442500591278,
"rewards/accuracy_reward_long_step": 0.44140625,
"rewards/final_brier_reward_long_step": 0.709702730178833,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.775976836681366,
"step": 103
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 631.0,
"completions/max_terminated_length": 631.0,
"completions/mean_length": 245.59765625,
"completions/mean_terminated_length": 245.59765625,
"completions/min_length": 133.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.1664,
"grad_norm": 0.032495662569999695,
"learning_rate": 5.333333333333333e-07,
"loss": -0.0091,
"num_tokens": 51279649.0,
"reward": 1.315579891204834,
"reward_std": 0.267301082611084,
"rewards/accuracy_reward_long_step": 0.4453125,
"rewards/final_brier_reward_long_step": 0.7196257710456848,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7614439725875854,
"step": 104
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 680.0,
"completions/max_terminated_length": 680.0,
"completions/mean_length": 252.44140625,
"completions/mean_terminated_length": 252.44140625,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.168,
"grad_norm": 0.03773434832692146,
"learning_rate": 5.277777777777777e-07,
"loss": 0.0192,
"num_tokens": 51778938.0,
"reward": 1.3275742530822754,
"reward_std": 0.2403724491596222,
"rewards/accuracy_reward_long_step": 0.46484375,
"rewards/final_brier_reward_long_step": 0.707565188407898,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7589816451072693,
"step": 105
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 700.0,
"completions/max_terminated_length": 700.0,
"completions/mean_length": 267.77734375,
"completions/mean_terminated_length": 267.77734375,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.1696,
"grad_norm": 0.032062213867902756,
"learning_rate": 5.222222222222223e-07,
"loss": -0.001,
"num_tokens": 52287993.0,
"reward": 1.2254037857055664,
"reward_std": 0.22276735305786133,
"rewards/accuracy_reward_long_step": 0.3515625,
"rewards/final_brier_reward_long_step": 0.7529284954071045,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7424367070198059,
"step": 106
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 666.0,
"completions/max_terminated_length": 666.0,
"completions/mean_length": 252.94140625,
"completions/mean_terminated_length": 252.94140625,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.1712,
"grad_norm": 0.06173327565193176,
"learning_rate": 5.166666666666667e-07,
"loss": -0.002,
"num_tokens": 52788314.0,
"reward": 1.2225637435913086,
"reward_std": 0.19366461038589478,
"rewards/accuracy_reward_long_step": 0.33203125,
"rewards/final_brier_reward_long_step": 0.7824580669403076,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7796720266342163,
"step": 107
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 606.0,
"completions/max_terminated_length": 606.0,
"completions/mean_length": 264.60546875,
"completions/mean_terminated_length": 264.60546875,
"completions/min_length": 122.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.1728,
"grad_norm": 0.034810252487659454,
"learning_rate": 5.111111111111111e-07,
"loss": -0.004,
"num_tokens": 53282845.0,
"reward": 1.2544004917144775,
"reward_std": 0.1617870330810547,
"rewards/accuracy_reward_long_step": 0.37890625,
"rewards/final_brier_reward_long_step": 0.7480487823486328,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7539278864860535,
"step": 108
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 662.0,
"completions/max_terminated_length": 662.0,
"completions/mean_length": 261.3046875,
"completions/mean_terminated_length": 261.3046875,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.1744,
"grad_norm": 0.15722031891345978,
"learning_rate": 5.055555555555555e-07,
"loss": -0.0081,
"num_tokens": 53767051.0,
"reward": 1.322205901145935,
"reward_std": 0.19752384722232819,
"rewards/accuracy_reward_long_step": 0.453125,
"rewards/final_brier_reward_long_step": 0.7100498676300049,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7662734389305115,
"step": 109
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 572.0,
"completions/max_terminated_length": 572.0,
"completions/mean_length": 266.1015625,
"completions/mean_terminated_length": 266.1015625,
"completions/min_length": 112.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.176,
"grad_norm": 0.03229089081287384,
"learning_rate": 5e-07,
"loss": -0.0056,
"num_tokens": 54263453.0,
"reward": 1.3309049606323242,
"reward_std": 0.25080251693725586,
"rewards/accuracy_reward_long_step": 0.45703125,
"rewards/final_brier_reward_long_step": 0.713943362236023,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7815513610839844,
"step": 110
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 566.0,
"completions/max_terminated_length": 566.0,
"completions/mean_length": 265.24609375,
"completions/mean_terminated_length": 265.24609375,
"completions/min_length": 122.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.1776,
"grad_norm": 0.04050762951374054,
"learning_rate": 4.944444444444445e-07,
"loss": -0.0176,
"num_tokens": 54756988.0,
"reward": 1.2833229303359985,
"reward_std": 0.1913967728614807,
"rewards/accuracy_reward_long_step": 0.3984375,
"rewards/final_brier_reward_long_step": 0.7617863416671753,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7777553796768188,
"step": 111
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 576.0,
"completions/max_terminated_length": 576.0,
"completions/mean_length": 252.6796875,
"completions/mean_terminated_length": 252.6796875,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.1792,
"grad_norm": 0.05397975817322731,
"learning_rate": 4.888888888888889e-07,
"loss": -0.0078,
"num_tokens": 55254026.0,
"reward": 1.2269587516784668,
"reward_std": 0.13054987788200378,
"rewards/accuracy_reward_long_step": 0.3359375,
"rewards/final_brier_reward_long_step": 0.7798945903778076,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7841900587081909,
"step": 112
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 533.0,
"completions/max_terminated_length": 533.0,
"completions/mean_length": 256.3671875,
"completions/mean_terminated_length": 256.3671875,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.1808,
"grad_norm": 0.12474508583545685,
"learning_rate": 4.833333333333333e-07,
"loss": -0.009,
"num_tokens": 55726928.0,
"reward": 1.36446213722229,
"reward_std": 0.2588464617729187,
"rewards/accuracy_reward_long_step": 0.49609375,
"rewards/final_brier_reward_long_step": 0.6960980892181396,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7773758172988892,
"step": 113
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 663.0,
"completions/max_terminated_length": 663.0,
"completions/mean_length": 267.078125,
"completions/mean_terminated_length": 267.078125,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.1824,
"grad_norm": 0.05340947210788727,
"learning_rate": 4.777777777777778e-07,
"loss": 0.0001,
"num_tokens": 56212564.0,
"reward": 1.3508033752441406,
"reward_std": 0.174399271607399,
"rewards/accuracy_reward_long_step": 0.47265625,
"rewards/final_brier_reward_long_step": 0.7279148101806641,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.784673810005188,
"step": 114
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 707.0,
"completions/max_terminated_length": 707.0,
"completions/mean_length": 258.515625,
"completions/mean_terminated_length": 258.515625,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.184,
"grad_norm": 0.051825981587171555,
"learning_rate": 4.722222222222222e-07,
"loss": 0.0029,
"num_tokens": 56705280.0,
"reward": 1.37041175365448,
"reward_std": 0.1563851535320282,
"rewards/accuracy_reward_long_step": 0.5,
"rewards/final_brier_reward_long_step": 0.7170413732528687,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7646055221557617,
"step": 115
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 502.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 257.93359375,
"completions/mean_terminated_length": 258.9450988769531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.1856,
"grad_norm": 0.056365448981523514,
"learning_rate": 4.6666666666666666e-07,
"loss": -0.0229,
"num_tokens": 57197543.0,
"reward": 1.3457661867141724,
"reward_std": 0.25224822759628296,
"rewards/accuracy_reward_long_step": 0.4765625,
"rewards/final_brier_reward_long_step": 0.7099324464797974,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7746949195861816,
"step": 116
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 612.0,
"completions/max_terminated_length": 612.0,
"completions/mean_length": 272.43359375,
"completions/mean_terminated_length": 272.43359375,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.1872,
"grad_norm": 0.042497143149375916,
"learning_rate": 4.611111111111111e-07,
"loss": -0.0156,
"num_tokens": 57681430.0,
"reward": 1.3612632751464844,
"reward_std": 0.25810113549232483,
"rewards/accuracy_reward_long_step": 0.49609375,
"rewards/final_brier_reward_long_step": 0.6923613548278809,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7761292457580566,
"step": 117
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 576.0,
"completions/max_terminated_length": 576.0,
"completions/mean_length": 268.60546875,
"completions/mean_terminated_length": 268.60546875,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.1888,
"grad_norm": 0.04082271829247475,
"learning_rate": 4.555555555555555e-07,
"loss": 0.0062,
"num_tokens": 58166401.0,
"reward": 1.3085888624191284,
"reward_std": 0.18546397984027863,
"rewards/accuracy_reward_long_step": 0.44140625,
"rewards/final_brier_reward_long_step": 0.7159848213195801,
"rewards/format_reward_long_step": 0.98828125,
"rewards/stepwise_brier_reward_long_step": 0.7761832475662231,
"step": 118
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 515.0,
"completions/max_terminated_length": 515.0,
"completions/mean_length": 276.85546875,
"completions/mean_terminated_length": 276.85546875,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.1904,
"grad_norm": 0.031454987823963165,
"learning_rate": 4.5e-07,
"loss": 0.0065,
"num_tokens": 58658684.0,
"reward": 1.2189127206802368,
"reward_std": 0.14538250863552094,
"rewards/accuracy_reward_long_step": 0.32421875,
"rewards/final_brier_reward_long_step": 0.7811195254325867,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7976564168930054,
"step": 119
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 522.0,
"completions/max_terminated_length": 522.0,
"completions/mean_length": 278.578125,
"completions/mean_terminated_length": 278.578125,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.192,
"grad_norm": 0.031778957694768906,
"learning_rate": 4.444444444444444e-07,
"loss": -0.0123,
"num_tokens": 59138176.0,
"reward": 1.3575904369354248,
"reward_std": 0.16297504305839539,
"rewards/accuracy_reward_long_step": 0.48046875,
"rewards/final_brier_reward_long_step": 0.7171749472618103,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7913117408752441,
"step": 120
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 647.0,
"completions/max_terminated_length": 647.0,
"completions/mean_length": 284.1015625,
"completions/mean_terminated_length": 284.1015625,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.1936,
"grad_norm": 0.046684540808200836,
"learning_rate": 4.3888888888888884e-07,
"loss": -0.0039,
"num_tokens": 59613906.0,
"reward": 1.4500417709350586,
"reward_std": 0.21346309781074524,
"rewards/accuracy_reward_long_step": 0.609375,
"rewards/final_brier_reward_long_step": 0.6556953191757202,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7225968837738037,
"step": 121
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 666.0,
"completions/max_terminated_length": 666.0,
"completions/mean_length": 290.79296875,
"completions/mean_terminated_length": 290.79296875,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.1952,
"grad_norm": 0.03638645261526108,
"learning_rate": 4.3333333333333335e-07,
"loss": 0.0073,
"num_tokens": 60115509.0,
"reward": 1.2977830171585083,
"reward_std": 0.22426971793174744,
"rewards/accuracy_reward_long_step": 0.4140625,
"rewards/final_brier_reward_long_step": 0.7605781555175781,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7821164131164551,
"step": 122
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 612.0,
"completions/max_terminated_length": 612.0,
"completions/mean_length": 276.171875,
"completions/mean_terminated_length": 276.171875,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.1968,
"grad_norm": 0.0302373468875885,
"learning_rate": 4.2777777777777775e-07,
"loss": -0.0006,
"num_tokens": 60614385.0,
"reward": 1.415604829788208,
"reward_std": 0.17988049983978271,
"rewards/accuracy_reward_long_step": 0.55078125,
"rewards/final_brier_reward_long_step": 0.6876656413078308,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7794409990310669,
"step": 123
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 559.0,
"completions/max_terminated_length": 559.0,
"completions/mean_length": 287.3828125,
"completions/mean_terminated_length": 287.3828125,
"completions/min_length": 149.0,
"completions/min_terminated_length": 149.0,
"epoch": 0.1984,
"grad_norm": 0.05541790649294853,
"learning_rate": 4.222222222222222e-07,
"loss": 0.003,
"num_tokens": 61111347.0,
"reward": 1.2961609363555908,
"reward_std": 0.21454349160194397,
"rewards/accuracy_reward_long_step": 0.41015625,
"rewards/final_brier_reward_long_step": 0.7439448833465576,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.8078866004943848,
"step": 124
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 710.0,
"completions/max_terminated_length": 710.0,
"completions/mean_length": 294.57421875,
"completions/mean_terminated_length": 294.57421875,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.2,
"grad_norm": 0.028944578021764755,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.017,
"num_tokens": 61617174.0,
"reward": 1.35416841506958,
"reward_std": 0.1785564422607422,
"rewards/accuracy_reward_long_step": 0.4765625,
"rewards/final_brier_reward_long_step": 0.7367419004440308,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7736819982528687,
"step": 125
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 641.0,
"completions/max_terminated_length": 641.0,
"completions/mean_length": 284.69921875,
"completions/mean_terminated_length": 284.69921875,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.2016,
"grad_norm": 0.06281944364309311,
"learning_rate": 4.1111111111111107e-07,
"loss": 0.0122,
"num_tokens": 62122729.0,
"reward": 1.3892717361450195,
"reward_std": 0.2381824553012848,
"rewards/accuracy_reward_long_step": 0.51171875,
"rewards/final_brier_reward_long_step": 0.7210512161254883,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7969735860824585,
"step": 126
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 565.0,
"completions/max_terminated_length": 565.0,
"completions/mean_length": 285.65625,
"completions/mean_terminated_length": 285.65625,
"completions/min_length": 121.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.2032,
"grad_norm": 0.04049834981560707,
"learning_rate": 4.055555555555555e-07,
"loss": 0.0137,
"num_tokens": 62628897.0,
"reward": 1.3196742534637451,
"reward_std": 0.1803930401802063,
"rewards/accuracy_reward_long_step": 0.43359375,
"rewards/final_brier_reward_long_step": 0.7413291931152344,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8029927015304565,
"step": 127
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 725.0,
"completions/max_terminated_length": 725.0,
"completions/mean_length": 289.6796875,
"completions/mean_terminated_length": 289.6796875,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.2048,
"grad_norm": 0.045730676501989365,
"learning_rate": 4e-07,
"loss": 0.0021,
"num_tokens": 63135039.0,
"reward": 1.3337769508361816,
"reward_std": 0.22858937084674835,
"rewards/accuracy_reward_long_step": 0.45703125,
"rewards/final_brier_reward_long_step": 0.7338937520980835,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7730889320373535,
"step": 128
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 728.0,
"completions/max_terminated_length": 728.0,
"completions/mean_length": 287.39453125,
"completions/mean_terminated_length": 287.39453125,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.2064,
"grad_norm": 0.027239752933382988,
"learning_rate": 3.9444444444444444e-07,
"loss": -0.0029,
"num_tokens": 63639908.0,
"reward": 1.3897987604141235,
"reward_std": 0.2298661321401596,
"rewards/accuracy_reward_long_step": 0.515625,
"rewards/final_brier_reward_long_step": 0.6942844390869141,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.8102232217788696,
"step": 129
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 643.0,
"completions/max_terminated_length": 643.0,
"completions/mean_length": 295.29296875,
"completions/mean_terminated_length": 295.29296875,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.208,
"grad_norm": 0.030676817521452904,
"learning_rate": 3.888888888888889e-07,
"loss": -0.0058,
"num_tokens": 64123943.0,
"reward": 1.3153623342514038,
"reward_std": 0.15915831923484802,
"rewards/accuracy_reward_long_step": 0.42578125,
"rewards/final_brier_reward_long_step": 0.7673367261886597,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7909875512123108,
"step": 130
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 651.0,
"completions/max_terminated_length": 651.0,
"completions/mean_length": 297.234375,
"completions/mean_terminated_length": 297.234375,
"completions/min_length": 126.0,
"completions/min_terminated_length": 126.0,
"epoch": 0.2096,
"grad_norm": 0.06891848146915436,
"learning_rate": 3.8333333333333335e-07,
"loss": 0.0102,
"num_tokens": 64630875.0,
"reward": 1.3944809436798096,
"reward_std": 0.14757326245307922,
"rewards/accuracy_reward_long_step": 0.51953125,
"rewards/final_brier_reward_long_step": 0.7101758122444153,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7896230220794678,
"step": 131
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 667.0,
"completions/max_terminated_length": 667.0,
"completions/mean_length": 302.23046875,
"completions/mean_terminated_length": 302.23046875,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.2112,
"grad_norm": 0.09698417037725449,
"learning_rate": 3.7777777777777775e-07,
"loss": -0.0078,
"num_tokens": 65137038.0,
"reward": 1.2600277662277222,
"reward_std": 0.17520007491111755,
"rewards/accuracy_reward_long_step": 0.36328125,
"rewards/final_brier_reward_long_step": 0.7893859148025513,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7976003885269165,
"step": 132
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 664.0,
"completions/max_terminated_length": 664.0,
"completions/mean_length": 296.4375,
"completions/mean_terminated_length": 296.4375,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.2128,
"grad_norm": 0.03775089234113693,
"learning_rate": 3.722222222222222e-07,
"loss": 0.0009,
"num_tokens": 65633878.0,
"reward": 1.2865660190582275,
"reward_std": 0.23296663165092468,
"rewards/accuracy_reward_long_step": 0.40234375,
"rewards/final_brier_reward_long_step": 0.7601765394210815,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7767128944396973,
"step": 133
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 610.0,
"completions/max_terminated_length": 610.0,
"completions/mean_length": 290.83203125,
"completions/mean_terminated_length": 290.83203125,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.2144,
"grad_norm": 0.030530598014593124,
"learning_rate": 3.666666666666666e-07,
"loss": 0.01,
"num_tokens": 66127099.0,
"reward": 1.2935059070587158,
"reward_std": 0.20468412339687347,
"rewards/accuracy_reward_long_step": 0.41015625,
"rewards/final_brier_reward_long_step": 0.7623168230056763,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7788943648338318,
"step": 134
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 701.0,
"completions/max_terminated_length": 701.0,
"completions/mean_length": 310.86328125,
"completions/mean_terminated_length": 310.86328125,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.216,
"grad_norm": 0.02592829428613186,
"learning_rate": 3.6111111111111107e-07,
"loss": -0.0094,
"num_tokens": 66623992.0,
"reward": 1.4253921508789062,
"reward_std": 0.12001308053731918,
"rewards/accuracy_reward_long_step": 0.55078125,
"rewards/final_brier_reward_long_step": 0.7179234027862549,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7805203795433044,
"step": 135
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 768.0,
"completions/max_terminated_length": 768.0,
"completions/mean_length": 316.38671875,
"completions/mean_terminated_length": 317.6274719238281,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.2176,
"grad_norm": 0.031040871515870094,
"learning_rate": 3.5555555555555553e-07,
"loss": -0.0145,
"num_tokens": 67146987.0,
"reward": 1.2932628393173218,
"reward_std": 0.1893351525068283,
"rewards/accuracy_reward_long_step": 0.40625,
"rewards/final_brier_reward_long_step": 0.7708644866943359,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7928118109703064,
"step": 136
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 623.0,
"completions/max_terminated_length": 623.0,
"completions/mean_length": 298.63671875,
"completions/mean_terminated_length": 298.63671875,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.2192,
"grad_norm": 0.1331356018781662,
"learning_rate": 3.5e-07,
"loss": 0.0031,
"num_tokens": 67653382.0,
"reward": 1.4031734466552734,
"reward_std": 0.19260135293006897,
"rewards/accuracy_reward_long_step": 0.51953125,
"rewards/final_brier_reward_long_step": 0.7188730239868164,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8156960010528564,
"step": 137
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 573.0,
"completions/max_terminated_length": 573.0,
"completions/mean_length": 304.4140625,
"completions/mean_terminated_length": 304.4140625,
"completions/min_length": 139.0,
"completions/min_terminated_length": 139.0,
"epoch": 0.2208,
"grad_norm": 0.030995018780231476,
"learning_rate": 3.4444444444444444e-07,
"loss": -0.0093,
"num_tokens": 68164376.0,
"reward": 1.3275476694107056,
"reward_std": 0.18350914120674133,
"rewards/accuracy_reward_long_step": 0.4296875,
"rewards/final_brier_reward_long_step": 0.7638988494873047,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8275418281555176,
"step": 138
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 642.0,
"completions/max_terminated_length": 642.0,
"completions/mean_length": 308.921875,
"completions/mean_terminated_length": 308.921875,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.2224,
"grad_norm": 0.03901790454983711,
"learning_rate": 3.388888888888889e-07,
"loss": 0.0065,
"num_tokens": 68669452.0,
"reward": 1.2002668380737305,
"reward_std": 0.15861909091472626,
"rewards/accuracy_reward_long_step": 0.296875,
"rewards/final_brier_reward_long_step": 0.8035297393798828,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8100374341011047,
"step": 139
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 648.0,
"completions/max_terminated_length": 648.0,
"completions/mean_length": 304.37109375,
"completions/mean_terminated_length": 304.37109375,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.224,
"grad_norm": 0.028289683163166046,
"learning_rate": 3.333333333333333e-07,
"loss": -0.0064,
"num_tokens": 69179523.0,
"reward": 1.3940101861953735,
"reward_std": 0.18012776970863342,
"rewards/accuracy_reward_long_step": 0.51953125,
"rewards/final_brier_reward_long_step": 0.7299957275390625,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7679198980331421,
"step": 140
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 715.0,
"completions/max_terminated_length": 715.0,
"completions/mean_length": 301.6328125,
"completions/mean_terminated_length": 301.6328125,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.2256,
"grad_norm": 0.032248951494693756,
"learning_rate": 3.2777777777777776e-07,
"loss": -0.0042,
"num_tokens": 69689229.0,
"reward": 1.339975357055664,
"reward_std": 0.1697162687778473,
"rewards/accuracy_reward_long_step": 0.45703125,
"rewards/final_brier_reward_long_step": 0.754450798034668,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7773258686065674,
"step": 141
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 555.0,
"completions/max_terminated_length": 555.0,
"completions/mean_length": 308.71875,
"completions/mean_terminated_length": 308.71875,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.2272,
"grad_norm": 0.0788806900382042,
"learning_rate": 3.222222222222222e-07,
"loss": 0.0148,
"num_tokens": 70210885.0,
"reward": 1.3627147674560547,
"reward_std": 0.17580462992191315,
"rewards/accuracy_reward_long_step": 0.484375,
"rewards/final_brier_reward_long_step": 0.7406051158905029,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7727540731430054,
"step": 142
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 619.0,
"completions/max_terminated_length": 619.0,
"completions/mean_length": 307.53125,
"completions/mean_terminated_length": 307.53125,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.2288,
"grad_norm": 0.02919752337038517,
"learning_rate": 3.166666666666666e-07,
"loss": 0.0041,
"num_tokens": 70731837.0,
"reward": 1.425824761390686,
"reward_std": 0.2417382448911667,
"rewards/accuracy_reward_long_step": 0.55859375,
"rewards/final_brier_reward_long_step": 0.6927086114883423,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.776215672492981,
"step": 143
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 748.0,
"completions/max_terminated_length": 748.0,
"completions/mean_length": 303.9921875,
"completions/mean_terminated_length": 303.9921875,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.2304,
"grad_norm": 0.02776823192834854,
"learning_rate": 3.111111111111111e-07,
"loss": 0.0044,
"num_tokens": 71232115.0,
"reward": 1.3090643882751465,
"reward_std": 0.18515193462371826,
"rewards/accuracy_reward_long_step": 0.41796875,
"rewards/final_brier_reward_long_step": 0.7715655565261841,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7928170561790466,
"step": 144
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 728.0,
"completions/max_terminated_length": 728.0,
"completions/mean_length": 299.80859375,
"completions/mean_terminated_length": 299.80859375,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.232,
"grad_norm": 0.030704284086823463,
"learning_rate": 3.055555555555556e-07,
"loss": 0.0199,
"num_tokens": 71750330.0,
"reward": 1.3886297941207886,
"reward_std": 0.21446546912193298,
"rewards/accuracy_reward_long_step": 0.5,
"rewards/final_brier_reward_long_step": 0.7539929747581482,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.80052649974823,
"step": 145
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 530.0,
"completions/max_terminated_length": 530.0,
"completions/mean_length": 300.62109375,
"completions/mean_terminated_length": 300.62109375,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.2336,
"grad_norm": 0.031654492020606995,
"learning_rate": 3e-07,
"loss": -0.0019,
"num_tokens": 72260633.0,
"reward": 1.4397811889648438,
"reward_std": 0.20938704907894135,
"rewards/accuracy_reward_long_step": 0.5625,
"rewards/final_brier_reward_long_step": 0.7215574383735657,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7875672578811646,
"step": 146
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 858.0,
"completions/max_terminated_length": 858.0,
"completions/mean_length": 306.19921875,
"completions/mean_terminated_length": 306.19921875,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.2352,
"grad_norm": 0.19715657830238342,
"learning_rate": 2.9444444444444444e-07,
"loss": 0.0069,
"num_tokens": 72758188.0,
"reward": 1.4754751920700073,
"reward_std": 0.26457875967025757,
"rewards/accuracy_reward_long_step": 0.60546875,
"rewards/final_brier_reward_long_step": 0.6992788910865784,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7807470560073853,
"step": 147
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 594.0,
"completions/max_terminated_length": 594.0,
"completions/mean_length": 307.36328125,
"completions/mean_terminated_length": 307.36328125,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.2368,
"grad_norm": 0.03840313479304314,
"learning_rate": 2.8888888888888885e-07,
"loss": -0.0044,
"num_tokens": 73264385.0,
"reward": 1.513301134109497,
"reward_std": 0.1907767504453659,
"rewards/accuracy_reward_long_step": 0.6484375,
"rewards/final_brier_reward_long_step": 0.7094078063964844,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7500470876693726,
"step": 148
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 612.0,
"completions/max_terminated_length": 612.0,
"completions/mean_length": 303.8828125,
"completions/mean_terminated_length": 303.8828125,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.2384,
"grad_norm": 0.027224667370319366,
"learning_rate": 2.833333333333333e-07,
"loss": -0.0015,
"num_tokens": 73775123.0,
"reward": 1.2874629497528076,
"reward_std": 0.16957233846187592,
"rewards/accuracy_reward_long_step": 0.39453125,
"rewards/final_brier_reward_long_step": 0.7941582202911377,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7853813767433167,
"step": 149
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 597.0,
"completions/max_terminated_length": 597.0,
"completions/mean_length": 312.90625,
"completions/mean_terminated_length": 312.90625,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.24,
"grad_norm": 0.04304026812314987,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.0097,
"num_tokens": 74265443.0,
"reward": 1.3638887405395508,
"reward_std": 0.27244532108306885,
"rewards/accuracy_reward_long_step": 0.4765625,
"rewards/final_brier_reward_long_step": 0.7445312738418579,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8047733306884766,
"step": 150
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 667.0,
"completions/max_terminated_length": 667.0,
"completions/mean_length": 297.63671875,
"completions/mean_terminated_length": 297.63671875,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.2416,
"grad_norm": 0.03442637622356415,
"learning_rate": 2.7222222222222216e-07,
"loss": -0.0125,
"num_tokens": 74766262.0,
"reward": 1.455894947052002,
"reward_std": 0.17580869793891907,
"rewards/accuracy_reward_long_step": 0.5703125,
"rewards/final_brier_reward_long_step": 0.7380698919296265,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8042596578598022,
"step": 151
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 687.0,
"completions/max_terminated_length": 687.0,
"completions/mean_length": 299.35546875,
"completions/mean_terminated_length": 299.35546875,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.2432,
"grad_norm": 0.02799561619758606,
"learning_rate": 2.6666666666666667e-07,
"loss": -0.0007,
"num_tokens": 75254505.0,
"reward": 1.2745752334594727,
"reward_std": 0.1415172815322876,
"rewards/accuracy_reward_long_step": 0.3828125,
"rewards/final_brier_reward_long_step": 0.7780320644378662,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.789018988609314,
"step": 152
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 644.0,
"completions/max_terminated_length": 644.0,
"completions/mean_length": 298.35546875,
"completions/mean_terminated_length": 298.35546875,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.2448,
"grad_norm": 0.03639660403132439,
"learning_rate": 2.6111111111111113e-07,
"loss": 0.0153,
"num_tokens": 75757548.0,
"reward": 1.4095165729522705,
"reward_std": 0.24766717851161957,
"rewards/accuracy_reward_long_step": 0.53125,
"rewards/final_brier_reward_long_step": 0.7201319932937622,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7929338216781616,
"step": 153
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 494.0,
"completions/max_terminated_length": 494.0,
"completions/mean_length": 293.26953125,
"completions/mean_terminated_length": 293.26953125,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.2464,
"grad_norm": 0.026929931715130806,
"learning_rate": 2.5555555555555553e-07,
"loss": -0.0083,
"num_tokens": 76259353.0,
"reward": 1.49515962600708,
"reward_std": 0.14950111508369446,
"rewards/accuracy_reward_long_step": 0.6171875,
"rewards/final_brier_reward_long_step": 0.720660924911499,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.791227400302887,
"step": 154
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 639.0,
"completions/max_terminated_length": 639.0,
"completions/mean_length": 294.78515625,
"completions/mean_terminated_length": 294.78515625,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.248,
"grad_norm": 0.060369208455085754,
"learning_rate": 2.5e-07,
"loss": 0.0095,
"num_tokens": 76766810.0,
"reward": 1.409111738204956,
"reward_std": 0.20820903778076172,
"rewards/accuracy_reward_long_step": 0.5234375,
"rewards/final_brier_reward_long_step": 0.754862904548645,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7956463098526001,
"step": 155
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 586.0,
"completions/max_terminated_length": 586.0,
"completions/mean_length": 314.47265625,
"completions/mean_terminated_length": 315.7059020996094,
"completions/min_length": 0.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.2496,
"grad_norm": 0.031120678409934044,
"learning_rate": 2.4444444444444445e-07,
"loss": -0.0077,
"num_tokens": 77273011.0,
"reward": 1.3123760223388672,
"reward_std": 0.2050066739320755,
"rewards/accuracy_reward_long_step": 0.43359375,
"rewards/final_brier_reward_long_step": 0.7430413961410522,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.779900074005127,
"step": 156
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 596.0,
"completions/max_terminated_length": 596.0,
"completions/mean_length": 305.6796875,
"completions/mean_terminated_length": 305.6796875,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.2512,
"grad_norm": 0.028397539630532265,
"learning_rate": 2.388888888888889e-07,
"loss": 0.0104,
"num_tokens": 77771369.0,
"reward": 1.4772114753723145,
"reward_std": 0.20590314269065857,
"rewards/accuracy_reward_long_step": 0.59375,
"rewards/final_brier_reward_long_step": 0.7362457513809204,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7976003289222717,
"step": 157
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 527.0,
"completions/max_terminated_length": 527.0,
"completions/mean_length": 312.51171875,
"completions/mean_terminated_length": 312.51171875,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.2528,
"grad_norm": 0.03033839352428913,
"learning_rate": 2.3333333333333333e-07,
"loss": -0.0084,
"num_tokens": 78293932.0,
"reward": 1.250650405883789,
"reward_std": 0.20709839463233948,
"rewards/accuracy_reward_long_step": 0.3515625,
"rewards/final_brier_reward_long_step": 0.780035138130188,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8163163661956787,
"step": 158
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 705.0,
"completions/max_terminated_length": 705.0,
"completions/mean_length": 311.6640625,
"completions/mean_terminated_length": 311.6640625,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.2544,
"grad_norm": 0.03519049659371376,
"learning_rate": 2.2777777777777776e-07,
"loss": -0.0036,
"num_tokens": 78801126.0,
"reward": 1.2244961261749268,
"reward_std": 0.17444197833538055,
"rewards/accuracy_reward_long_step": 0.31640625,
"rewards/final_brier_reward_long_step": 0.8095117211341858,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.822847843170166,
"step": 159
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 610.0,
"completions/max_terminated_length": 610.0,
"completions/mean_length": 294.1328125,
"completions/mean_terminated_length": 294.1328125,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.256,
"grad_norm": 0.04099366441369057,
"learning_rate": 2.222222222222222e-07,
"loss": -0.0079,
"num_tokens": 79294264.0,
"reward": 1.3805627822875977,
"reward_std": 0.18659856915473938,
"rewards/accuracy_reward_long_step": 0.48828125,
"rewards/final_brier_reward_long_step": 0.7798925638198853,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.789233922958374,
"step": 160
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 563.0,
"completions/max_terminated_length": 563.0,
"completions/mean_length": 312.27734375,
"completions/mean_terminated_length": 312.27734375,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.2576,
"grad_norm": 0.05054928734898567,
"learning_rate": 2.1666666666666667e-07,
"loss": -0.0004,
"num_tokens": 79797063.0,
"reward": 1.3297412395477295,
"reward_std": 0.17361152172088623,
"rewards/accuracy_reward_long_step": 0.4375,
"rewards/final_brier_reward_long_step": 0.766510546207428,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.802454948425293,
"step": 161
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 709.0,
"completions/max_terminated_length": 709.0,
"completions/mean_length": 310.8359375,
"completions/mean_terminated_length": 310.8359375,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.2592,
"grad_norm": 0.02891719527542591,
"learning_rate": 2.111111111111111e-07,
"loss": -0.003,
"num_tokens": 80291645.0,
"reward": 1.264817237854004,
"reward_std": 0.187312513589859,
"rewards/accuracy_reward_long_step": 0.375,
"rewards/final_brier_reward_long_step": 0.7664257884025574,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7928431034088135,
"step": 162
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 673.0,
"completions/max_terminated_length": 673.0,
"completions/mean_length": 312.3671875,
"completions/mean_terminated_length": 312.3671875,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"epoch": 0.2608,
"grad_norm": 0.040000658482313156,
"learning_rate": 2.0555555555555553e-07,
"loss": 0.0004,
"num_tokens": 80794195.0,
"reward": 1.4218961000442505,
"reward_std": 0.21327157318592072,
"rewards/accuracy_reward_long_step": 0.53125,
"rewards/final_brier_reward_long_step": 0.7526370882987976,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8099474906921387,
"step": 163
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 742.0,
"completions/max_terminated_length": 742.0,
"completions/mean_length": 309.671875,
"completions/mean_terminated_length": 309.671875,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.2624,
"grad_norm": 0.03159433230757713,
"learning_rate": 2e-07,
"loss": 0.0153,
"num_tokens": 81305319.0,
"reward": 1.4856319427490234,
"reward_std": 0.19633854925632477,
"rewards/accuracy_reward_long_step": 0.60546875,
"rewards/final_brier_reward_long_step": 0.7205570340156555,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8000956773757935,
"step": 164
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 948.0,
"completions/max_terminated_length": 948.0,
"completions/mean_length": 313.97265625,
"completions/mean_terminated_length": 313.97265625,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.264,
"grad_norm": 0.027477793395519257,
"learning_rate": 1.9444444444444445e-07,
"loss": -0.0033,
"num_tokens": 81799832.0,
"reward": 1.4248054027557373,
"reward_std": 0.2430860996246338,
"rewards/accuracy_reward_long_step": 0.5390625,
"rewards/final_brier_reward_long_step": 0.722133219242096,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8208386898040771,
"step": 165
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 706.0,
"completions/max_terminated_length": 706.0,
"completions/mean_length": 311.08203125,
"completions/mean_terminated_length": 311.08203125,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.2656,
"grad_norm": 0.09807036817073822,
"learning_rate": 1.8888888888888888e-07,
"loss": -0.0055,
"num_tokens": 82305733.0,
"reward": 1.256960391998291,
"reward_std": 0.20458321273326874,
"rewards/accuracy_reward_long_step": 0.36328125,
"rewards/final_brier_reward_long_step": 0.7870085835456848,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7877079248428345,
"step": 166
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 811.0,
"completions/max_terminated_length": 811.0,
"completions/mean_length": 318.3828125,
"completions/mean_terminated_length": 318.3828125,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.2672,
"grad_norm": 0.02899632230401039,
"learning_rate": 1.833333333333333e-07,
"loss": -0.0024,
"num_tokens": 82827687.0,
"reward": 1.2728254795074463,
"reward_std": 0.17899630963802338,
"rewards/accuracy_reward_long_step": 0.3828125,
"rewards/final_brier_reward_long_step": 0.7677257657051086,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7923259735107422,
"step": 167
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 574.0,
"completions/max_terminated_length": 574.0,
"completions/mean_length": 312.75,
"completions/mean_terminated_length": 312.75,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.2688,
"grad_norm": 0.026410935446619987,
"learning_rate": 1.7777777777777776e-07,
"loss": 0.0074,
"num_tokens": 83331319.0,
"reward": 1.300147294998169,
"reward_std": 0.16780969500541687,
"rewards/accuracy_reward_long_step": 0.40234375,
"rewards/final_brier_reward_long_step": 0.7630214691162109,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8281925916671753,
"step": 168
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 561.0,
"completions/max_terminated_length": 561.0,
"completions/mean_length": 314.34375,
"completions/mean_terminated_length": 314.34375,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.2704,
"grad_norm": 0.048373743891716,
"learning_rate": 1.7222222222222222e-07,
"loss": 0.0079,
"num_tokens": 83842919.0,
"reward": 1.2883470058441162,
"reward_std": 0.19052302837371826,
"rewards/accuracy_reward_long_step": 0.390625,
"rewards/final_brier_reward_long_step": 0.7791671752929688,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.81172114610672,
"step": 169
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 600.0,
"completions/max_terminated_length": 600.0,
"completions/mean_length": 309.91796875,
"completions/mean_terminated_length": 309.91796875,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.272,
"grad_norm": 0.0328993983566761,
"learning_rate": 1.6666666666666665e-07,
"loss": 0.014,
"num_tokens": 84327362.0,
"reward": 1.3099801540374756,
"reward_std": 0.1611907184123993,
"rewards/accuracy_reward_long_step": 0.41796875,
"rewards/final_brier_reward_long_step": 0.7775378823280334,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7983198761940002,
"step": 170
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 730.0,
"completions/max_terminated_length": 730.0,
"completions/mean_length": 319.8203125,
"completions/mean_terminated_length": 319.8203125,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.2736,
"grad_norm": 0.0332467257976532,
"learning_rate": 1.611111111111111e-07,
"loss": -0.0146,
"num_tokens": 84835268.0,
"reward": 1.339202642440796,
"reward_std": 0.20696350932121277,
"rewards/accuracy_reward_long_step": 0.453125,
"rewards/final_brier_reward_long_step": 0.750048041343689,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7942621111869812,
"step": 171
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 686.0,
"completions/max_terminated_length": 686.0,
"completions/mean_length": 324.1171875,
"completions/mean_terminated_length": 324.1171875,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.2752,
"grad_norm": 0.05583081394433975,
"learning_rate": 1.5555555555555556e-07,
"loss": -0.0116,
"num_tokens": 85354394.0,
"reward": 1.2257599830627441,
"reward_std": 0.1692497879266739,
"rewards/accuracy_reward_long_step": 0.33203125,
"rewards/final_brier_reward_long_step": 0.7842777371406555,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7906370162963867,
"step": 172
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 627.0,
"completions/max_terminated_length": 627.0,
"completions/mean_length": 304.73828125,
"completions/mean_terminated_length": 304.73828125,
"completions/min_length": 123.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.2768,
"grad_norm": 0.03522387892007828,
"learning_rate": 1.5e-07,
"loss": 0.0098,
"num_tokens": 85859007.0,
"reward": 1.3143270015716553,
"reward_std": 0.23036913573741913,
"rewards/accuracy_reward_long_step": 0.41796875,
"rewards/final_brier_reward_long_step": 0.7498409748077393,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8355921506881714,
"step": 173
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 561.0,
"completions/max_terminated_length": 561.0,
"completions/mean_length": 302.87109375,
"completions/mean_terminated_length": 302.87109375,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.2784,
"grad_norm": 0.027995487675070763,
"learning_rate": 1.4444444444444442e-07,
"loss": -0.0041,
"num_tokens": 86369230.0,
"reward": 1.2683026790618896,
"reward_std": 0.22589144110679626,
"rewards/accuracy_reward_long_step": 0.3828125,
"rewards/final_brier_reward_long_step": 0.7601765394210815,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7817836999893188,
"step": 174
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 688.0,
"completions/max_terminated_length": 688.0,
"completions/mean_length": 307.44921875,
"completions/mean_terminated_length": 307.44921875,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.28,
"grad_norm": 0.028536871075630188,
"learning_rate": 1.3888888888888888e-07,
"loss": 0.0019,
"num_tokens": 86862377.0,
"reward": 1.4214547872543335,
"reward_std": 0.19105279445648193,
"rewards/accuracy_reward_long_step": 0.53125,
"rewards/final_brier_reward_long_step": 0.7514737844467163,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8093452453613281,
"step": 175
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 657.0,
"completions/max_terminated_length": 657.0,
"completions/mean_length": 306.5390625,
"completions/mean_terminated_length": 306.5390625,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.2816,
"grad_norm": 0.027810126543045044,
"learning_rate": 1.3333333333333334e-07,
"loss": -0.0032,
"num_tokens": 87353779.0,
"reward": 1.3975551128387451,
"reward_std": 0.1693097949028015,
"rewards/accuracy_reward_long_step": 0.51171875,
"rewards/final_brier_reward_long_step": 0.7351202964782715,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8082250952720642,
"step": 176
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 726.0,
"completions/max_terminated_length": 726.0,
"completions/mean_length": 302.92578125,
"completions/mean_terminated_length": 302.92578125,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.2832,
"grad_norm": 0.027919691056013107,
"learning_rate": 1.2777777777777777e-07,
"loss": -0.0057,
"num_tokens": 87859856.0,
"reward": 1.3061156272888184,
"reward_std": 0.17378166317939758,
"rewards/accuracy_reward_long_step": 0.421875,
"rewards/final_brier_reward_long_step": 0.7432960867881775,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7936666011810303,
"step": 177
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 625.0,
"completions/max_terminated_length": 625.0,
"completions/mean_length": 317.79296875,
"completions/mean_terminated_length": 317.79296875,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.2848,
"grad_norm": 0.026690706610679626,
"learning_rate": 1.2222222222222222e-07,
"loss": 0.007,
"num_tokens": 88358995.0,
"reward": 1.3103278875350952,
"reward_std": 0.1235455721616745,
"rewards/accuracy_reward_long_step": 0.41015625,
"rewards/final_brier_reward_long_step": 0.7893308401107788,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8113558292388916,
"step": 178
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 582.0,
"completions/max_terminated_length": 582.0,
"completions/mean_length": 304.3828125,
"completions/mean_terminated_length": 304.3828125,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.2864,
"grad_norm": 0.0654895007610321,
"learning_rate": 1.1666666666666667e-07,
"loss": -0.0002,
"num_tokens": 88861517.0,
"reward": 1.3416030406951904,
"reward_std": 0.18121816217899323,
"rewards/accuracy_reward_long_step": 0.44921875,
"rewards/final_brier_reward_long_step": 0.7645456790924072,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.8128038644790649,
"step": 179
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 631.0,
"completions/max_terminated_length": 631.0,
"completions/mean_length": 303.37890625,
"completions/mean_terminated_length": 303.37890625,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.288,
"grad_norm": 0.029484573751688004,
"learning_rate": 1.111111111111111e-07,
"loss": 0.001,
"num_tokens": 89350494.0,
"reward": 1.4031870365142822,
"reward_std": 0.20662115514278412,
"rewards/accuracy_reward_long_step": 0.52734375,
"rewards/final_brier_reward_long_step": 0.7230523228645325,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7803207039833069,
"step": 180
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 667.0,
"completions/max_terminated_length": 667.0,
"completions/mean_length": 311.08984375,
"completions/mean_terminated_length": 311.08984375,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.2896,
"grad_norm": 0.06027643755078316,
"learning_rate": 1.0555555555555555e-07,
"loss": 0.0061,
"num_tokens": 89854605.0,
"reward": 1.3537577390670776,
"reward_std": 0.21486344933509827,
"rewards/accuracy_reward_long_step": 0.47265625,
"rewards/final_brier_reward_long_step": 0.7450315952301025,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7871868014335632,
"step": 181
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 754.0,
"completions/max_terminated_length": 754.0,
"completions/mean_length": 304.125,
"completions/mean_terminated_length": 304.125,
"completions/min_length": 122.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.2912,
"grad_norm": 0.027213167399168015,
"learning_rate": 1e-07,
"loss": -0.0018,
"num_tokens": 90350773.0,
"reward": 1.3099863529205322,
"reward_std": 0.2428930103778839,
"rewards/accuracy_reward_long_step": 0.43359375,
"rewards/final_brier_reward_long_step": 0.7415887117385864,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.7796065807342529,
"step": 182
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 606.0,
"completions/max_terminated_length": 606.0,
"completions/mean_length": 304.78515625,
"completions/mean_terminated_length": 304.78515625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.2928,
"grad_norm": 0.028131412342190742,
"learning_rate": 9.444444444444444e-08,
"loss": -0.0014,
"num_tokens": 90842758.0,
"reward": 1.399085521697998,
"reward_std": 0.13641595840454102,
"rewards/accuracy_reward_long_step": 0.51171875,
"rewards/final_brier_reward_long_step": 0.7427164316177368,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8067511916160583,
"step": 183
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 735.0,
"completions/max_terminated_length": 735.0,
"completions/mean_length": 319.765625,
"completions/mean_terminated_length": 319.765625,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.2944,
"grad_norm": 0.03541647642850876,
"learning_rate": 8.888888888888888e-08,
"loss": 0.0052,
"num_tokens": 91348666.0,
"reward": 1.3956681489944458,
"reward_std": 0.1705297827720642,
"rewards/accuracy_reward_long_step": 0.515625,
"rewards/final_brier_reward_long_step": 0.7347894906997681,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.7931956052780151,
"step": 184
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 718.0,
"completions/max_terminated_length": 718.0,
"completions/mean_length": 314.33203125,
"completions/mean_terminated_length": 314.33203125,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.296,
"grad_norm": 0.032346244901418686,
"learning_rate": 8.333333333333333e-08,
"loss": -0.0086,
"num_tokens": 91854359.0,
"reward": 1.2988712787628174,
"reward_std": 0.18167896568775177,
"rewards/accuracy_reward_long_step": 0.41796875,
"rewards/final_brier_reward_long_step": 0.7545672059059143,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7690430879592896,
"step": 185
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 734.0,
"completions/max_terminated_length": 734.0,
"completions/mean_length": 307.546875,
"completions/mean_terminated_length": 307.546875,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.2976,
"grad_norm": 0.027610866352915764,
"learning_rate": 7.777777777777778e-08,
"loss": 0.0048,
"num_tokens": 92349963.0,
"reward": 1.4005792140960693,
"reward_std": 0.19216248393058777,
"rewards/accuracy_reward_long_step": 0.515625,
"rewards/final_brier_reward_long_step": 0.7557350993156433,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7840815782546997,
"step": 186
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 635.0,
"completions/max_terminated_length": 635.0,
"completions/mean_length": 320.875,
"completions/mean_terminated_length": 320.875,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.2992,
"grad_norm": 0.027752123773097992,
"learning_rate": 7.222222222222221e-08,
"loss": 0.0079,
"num_tokens": 92853435.0,
"reward": 1.3692231178283691,
"reward_std": 0.2207336574792862,
"rewards/accuracy_reward_long_step": 0.484375,
"rewards/final_brier_reward_long_step": 0.7303339838981628,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.8168707489967346,
"step": 187
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 672.0,
"completions/max_terminated_length": 672.0,
"completions/mean_length": 323.48046875,
"completions/mean_terminated_length": 323.48046875,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.3008,
"grad_norm": 0.03371801972389221,
"learning_rate": 6.666666666666667e-08,
"loss": 0.0037,
"num_tokens": 93376046.0,
"reward": 1.347980260848999,
"reward_std": 0.1777208149433136,
"rewards/accuracy_reward_long_step": 0.4609375,
"rewards/final_brier_reward_long_step": 0.7578449249267578,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7903263568878174,
"step": 188
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 613.0,
"completions/max_terminated_length": 613.0,
"completions/mean_length": 312.94140625,
"completions/mean_terminated_length": 312.94140625,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.3024,
"grad_norm": 0.02993335761129856,
"learning_rate": 6.111111111111111e-08,
"loss": -0.001,
"num_tokens": 93872863.0,
"reward": 1.372868299484253,
"reward_std": 0.10951399803161621,
"rewards/accuracy_reward_long_step": 0.484375,
"rewards/final_brier_reward_long_step": 0.7652988433837891,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7886741161346436,
"step": 189
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 623.0,
"completions/max_terminated_length": 623.0,
"completions/mean_length": 316.453125,
"completions/mean_terminated_length": 316.453125,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.304,
"grad_norm": 0.038850028067827225,
"learning_rate": 5.555555555555555e-08,
"loss": -0.0096,
"num_tokens": 94377163.0,
"reward": 1.2374032735824585,
"reward_std": 0.2162398397922516,
"rewards/accuracy_reward_long_step": 0.34765625,
"rewards/final_brier_reward_long_step": 0.7705594301223755,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7884289026260376,
"step": 190
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 759.0,
"completions/max_terminated_length": 759.0,
"completions/mean_length": 310.67578125,
"completions/mean_terminated_length": 310.67578125,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.3056,
"grad_norm": 0.09805744141340256,
"learning_rate": 5e-08,
"loss": -0.0029,
"num_tokens": 94885656.0,
"reward": 1.3634986877441406,
"reward_std": 0.14901340007781982,
"rewards/accuracy_reward_long_step": 0.4765625,
"rewards/final_brier_reward_long_step": 0.7494964599609375,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.8060606718063354,
"step": 191
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 655.0,
"completions/max_terminated_length": 655.0,
"completions/mean_length": 323.4453125,
"completions/mean_terminated_length": 323.4453125,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.3072,
"grad_norm": 0.036289751529693604,
"learning_rate": 4.444444444444444e-08,
"loss": 0.0008,
"num_tokens": 95401338.0,
"reward": 1.1621966361999512,
"reward_std": 0.17893055081367493,
"rewards/accuracy_reward_long_step": 0.2578125,
"rewards/final_brier_reward_long_step": 0.8198968768119812,
"rewards/format_reward_long_step": 0.9921875,
"rewards/stepwise_brier_reward_long_step": 0.8132648468017578,
"step": 192
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 616.0,
"completions/max_terminated_length": 616.0,
"completions/mean_length": 317.6953125,
"completions/mean_terminated_length": 317.6953125,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.3088,
"grad_norm": 0.033580850809812546,
"learning_rate": 3.888888888888889e-08,
"loss": 0.0163,
"num_tokens": 95915484.0,
"reward": 1.4245002269744873,
"reward_std": 0.18409396708011627,
"rewards/accuracy_reward_long_step": 0.54296875,
"rewards/final_brier_reward_long_step": 0.7327523231506348,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.793373167514801,
"step": 193
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 585.0,
"completions/max_terminated_length": 585.0,
"completions/mean_length": 319.3046875,
"completions/mean_terminated_length": 319.3046875,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.3104,
"grad_norm": 0.0271166805177927,
"learning_rate": 3.3333333333333334e-08,
"loss": 0.0106,
"num_tokens": 96427698.0,
"reward": 1.3507544994354248,
"reward_std": 0.23310068249702454,
"rewards/accuracy_reward_long_step": 0.4609375,
"rewards/final_brier_reward_long_step": 0.7412347197532654,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.8258461356163025,
"step": 194
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 563.0,
"completions/max_terminated_length": 563.0,
"completions/mean_length": 311.0546875,
"completions/mean_terminated_length": 311.0546875,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.312,
"grad_norm": 0.029426012188196182,
"learning_rate": 2.7777777777777774e-08,
"loss": 0.0089,
"num_tokens": 96932352.0,
"reward": 1.4266903400421143,
"reward_std": 0.17993581295013428,
"rewards/accuracy_reward_long_step": 0.54296875,
"rewards/final_brier_reward_long_step": 0.7373980283737183,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.8053007125854492,
"step": 195
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 629.0,
"completions/max_terminated_length": 629.0,
"completions/mean_length": 316.51171875,
"completions/mean_terminated_length": 316.51171875,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.3136,
"grad_norm": 0.02578766457736492,
"learning_rate": 2.222222222222222e-08,
"loss": 0.0107,
"num_tokens": 97441803.0,
"reward": 1.5241920948028564,
"reward_std": 0.19832386076450348,
"rewards/accuracy_reward_long_step": 0.64453125,
"rewards/final_brier_reward_long_step": 0.7144637107849121,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.804179847240448,
"step": 196
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 578.0,
"completions/max_terminated_length": 578.0,
"completions/mean_length": 305.58984375,
"completions/mean_terminated_length": 305.58984375,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.3152,
"grad_norm": 0.03043382056057453,
"learning_rate": 1.6666666666666667e-08,
"loss": 0.0183,
"num_tokens": 97947882.0,
"reward": 1.3735759258270264,
"reward_std": 0.19583386182785034,
"rewards/accuracy_reward_long_step": 0.47265625,
"rewards/final_brier_reward_long_step": 0.7926558256149292,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.8110226988792419,
"step": 197
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 606.0,
"completions/max_terminated_length": 606.0,
"completions/mean_length": 303.71484375,
"completions/mean_terminated_length": 303.71484375,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.3168,
"grad_norm": 0.028867207467556,
"learning_rate": 1.111111111111111e-08,
"loss": -0.0018,
"num_tokens": 98453969.0,
"reward": 1.3594614267349243,
"reward_std": 0.21242399513721466,
"rewards/accuracy_reward_long_step": 0.47265625,
"rewards/final_brier_reward_long_step": 0.7498824000358582,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7973384261131287,
"step": 198
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 614.0,
"completions/max_terminated_length": 614.0,
"completions/mean_length": 315.70703125,
"completions/mean_terminated_length": 315.70703125,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.3184,
"grad_norm": 0.045795369893312454,
"learning_rate": 5.555555555555555e-09,
"loss": -0.0012,
"num_tokens": 98964222.0,
"reward": 1.3441438674926758,
"reward_std": 0.17573949694633484,
"rewards/accuracy_reward_long_step": 0.453125,
"rewards/final_brier_reward_long_step": 0.7653417587280273,
"rewards/format_reward_long_step": 1.0,
"rewards/stepwise_brier_reward_long_step": 0.7987333536148071,
"step": 199
},
{
"calib/answer_extract_rate": 0.0,
"calib/avg_num_step_conf": 0.0,
"calib/final_conf_rate": 0.0,
"calib/format_rate": 0.0,
"calib/nonempty_final_conf_rate": 0.0,
"calib/nonempty_reasoning_rate": 0.0,
"calib/nonempty_step_conf_rate": 0.0,
"calib/step_conf_rate": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 577.0,
"completions/max_terminated_length": 577.0,
"completions/mean_length": 304.7421875,
"completions/mean_terminated_length": 305.9372863769531,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.32,
"grad_norm": 0.025520017370581627,
"learning_rate": 0.0,
"loss": -0.018,
"num_tokens": 99441740.0,
"reward": 1.34486985206604,
"reward_std": 0.1514306366443634,
"rewards/accuracy_reward_long_step": 0.45703125,
"rewards/final_brier_reward_long_step": 0.7545433640480042,
"rewards/format_reward_long_step": 0.99609375,
"rewards/stepwise_brier_reward_long_step": 0.8046233654022217,
"step": 200
},
{
"epoch": 0.32,
"step": 200,
"total_flos": 0.0,
"train_loss": -0.016620432519121094,
"train_runtime": 11304.4046,
"train_samples_per_second": 4.529,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 99441740,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}