21347 lines
807 KiB
JSON
21347 lines
807 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 500,
|
|
"global_step": 625,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1640625,
|
|
"completions/max_length": 1000.0,
|
|
"completions/max_terminated_length": 1000.0,
|
|
"completions/mean_length": 341.4609375,
|
|
"completions/mean_terminated_length": 408.47662353515625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.0016,
|
|
"grad_norm": 0.17134852707386017,
|
|
"learning_rate": 1.5873015873015872e-08,
|
|
"loss": -0.116,
|
|
"num_tokens": 486582.0,
|
|
"reward": 0.41310209035873413,
|
|
"reward_std": 0.4805126190185547,
|
|
"rewards/accuracy_reward_long_step": 0.2265625,
|
|
"rewards/final_brier_reward_long_step": 0.11814829707145691,
|
|
"rewards/format_reward_long_step": 0.23046875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.1670725792646408,
|
|
"step": 1
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.19140625,
|
|
"completions/max_length": 1019.0,
|
|
"completions/max_terminated_length": 1019.0,
|
|
"completions/mean_length": 303.75,
|
|
"completions/mean_terminated_length": 375.65216064453125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0032,
|
|
"grad_norm": 0.6887706518173218,
|
|
"learning_rate": 3.1746031746031744e-08,
|
|
"loss": -0.1486,
|
|
"num_tokens": 985630.0,
|
|
"reward": 0.4098304212093353,
|
|
"reward_std": 0.5015645623207092,
|
|
"rewards/accuracy_reward_long_step": 0.1875,
|
|
"rewards/final_brier_reward_long_step": 0.1355031430721283,
|
|
"rewards/format_reward_long_step": 0.27734375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.1991310715675354,
|
|
"step": 2
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.15234375,
|
|
"completions/max_length": 1022.0,
|
|
"completions/max_terminated_length": 1022.0,
|
|
"completions/mean_length": 353.46484375,
|
|
"completions/mean_terminated_length": 416.99078369140625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0048,
|
|
"grad_norm": 0.411814421415329,
|
|
"learning_rate": 4.7619047619047613e-08,
|
|
"loss": -0.1027,
|
|
"num_tokens": 1490821.0,
|
|
"reward": 0.41081345081329346,
|
|
"reward_std": 0.5538315773010254,
|
|
"rewards/accuracy_reward_long_step": 0.19921875,
|
|
"rewards/final_brier_reward_long_step": 0.13149982690811157,
|
|
"rewards/format_reward_long_step": 0.25390625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.20706646144390106,
|
|
"step": 3
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.15625,
|
|
"completions/max_length": 944.0,
|
|
"completions/max_terminated_length": 944.0,
|
|
"completions/mean_length": 340.40625,
|
|
"completions/mean_terminated_length": 403.4444580078125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0064,
|
|
"grad_norm": 0.40814000368118286,
|
|
"learning_rate": 6.349206349206349e-08,
|
|
"loss": -0.0835,
|
|
"num_tokens": 2004965.0,
|
|
"reward": 0.3751431107521057,
|
|
"reward_std": 0.48189181089401245,
|
|
"rewards/accuracy_reward_long_step": 0.1875,
|
|
"rewards/final_brier_reward_long_step": 0.11413241922855377,
|
|
"rewards/format_reward_long_step": 0.23046875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.17550255358219147,
|
|
"step": 4
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.15234375,
|
|
"completions/max_length": 1001.0,
|
|
"completions/max_terminated_length": 1001.0,
|
|
"completions/mean_length": 343.70703125,
|
|
"completions/mean_terminated_length": 405.479248046875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.008,
|
|
"grad_norm": 0.37365081906318665,
|
|
"learning_rate": 7.936507936507936e-08,
|
|
"loss": -0.0435,
|
|
"num_tokens": 2528514.0,
|
|
"reward": 0.34497031569480896,
|
|
"reward_std": 0.45299145579338074,
|
|
"rewards/accuracy_reward_long_step": 0.14453125,
|
|
"rewards/final_brier_reward_long_step": 0.10058828443288803,
|
|
"rewards/format_reward_long_step": 0.26953125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.1621055006980896,
|
|
"step": 5
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1640625,
|
|
"completions/max_length": 996.0,
|
|
"completions/max_terminated_length": 996.0,
|
|
"completions/mean_length": 310.33203125,
|
|
"completions/mean_terminated_length": 371.2383117675781,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0096,
|
|
"grad_norm": 2.2686784267425537,
|
|
"learning_rate": 9.523809523809523e-08,
|
|
"loss": -0.1196,
|
|
"num_tokens": 3035623.0,
|
|
"reward": 0.39475879073143005,
|
|
"reward_std": 0.5006267428398132,
|
|
"rewards/accuracy_reward_long_step": 0.1953125,
|
|
"rewards/final_brier_reward_long_step": 0.11408085376024246,
|
|
"rewards/format_reward_long_step": 0.2421875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.19932931661605835,
|
|
"step": 6
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1640625,
|
|
"completions/max_length": 1011.0,
|
|
"completions/max_terminated_length": 1011.0,
|
|
"completions/mean_length": 349.5703125,
|
|
"completions/mean_terminated_length": 418.17755126953125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0112,
|
|
"grad_norm": 0.5209683179855347,
|
|
"learning_rate": 1.111111111111111e-07,
|
|
"loss": -0.1158,
|
|
"num_tokens": 3544593.0,
|
|
"reward": 0.36087095737457275,
|
|
"reward_std": 0.5047852993011475,
|
|
"rewards/accuracy_reward_long_step": 0.16015625,
|
|
"rewards/final_brier_reward_long_step": 0.11222599446773529,
|
|
"rewards/format_reward_long_step": 0.25390625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.18282026052474976,
|
|
"step": 7
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0078125,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0078125,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.19140625,
|
|
"completions/max_length": 1020.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 334.12109375,
|
|
"completions/mean_terminated_length": 413.2125549316406,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0128,
|
|
"grad_norm": 0.5285189151763916,
|
|
"learning_rate": 1.2698412698412698e-07,
|
|
"loss": -0.0977,
|
|
"num_tokens": 4034728.0,
|
|
"reward": 0.4276258945465088,
|
|
"reward_std": 0.5124700665473938,
|
|
"rewards/accuracy_reward_long_step": 0.1953125,
|
|
"rewards/final_brier_reward_long_step": 0.13241875171661377,
|
|
"rewards/format_reward_long_step": 0.2890625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.2187097817659378,
|
|
"step": 8
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.14453125,
|
|
"completions/max_length": 976.0,
|
|
"completions/max_terminated_length": 976.0,
|
|
"completions/mean_length": 361.22265625,
|
|
"completions/mean_terminated_length": 422.2511291503906,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0144,
|
|
"grad_norm": 2.121281862258911,
|
|
"learning_rate": 1.4285714285714285e-07,
|
|
"loss": -0.1102,
|
|
"num_tokens": 4560393.0,
|
|
"reward": 0.2677909731864929,
|
|
"reward_std": 0.41135305166244507,
|
|
"rewards/accuracy_reward_long_step": 0.07421875,
|
|
"rewards/final_brier_reward_long_step": 0.11064782738685608,
|
|
"rewards/format_reward_long_step": 0.25,
|
|
"rewards/stepwise_brier_reward_long_step": 0.163641095161438,
|
|
"step": 9
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 0.0,
|
|
"calib/mean_conf": 0.2,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.17578125,
|
|
"completions/max_length": 1022.0,
|
|
"completions/max_terminated_length": 1022.0,
|
|
"completions/mean_length": 344.53515625,
|
|
"completions/mean_terminated_length": 418.01422119140625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.016,
|
|
"grad_norm": 0.2448827624320984,
|
|
"learning_rate": 1.5873015873015872e-07,
|
|
"loss": -0.1562,
|
|
"num_tokens": 5068466.0,
|
|
"reward": 0.3565204441547394,
|
|
"reward_std": 0.4487614333629608,
|
|
"rewards/accuracy_reward_long_step": 0.15234375,
|
|
"rewards/final_brier_reward_long_step": 0.11602266132831573,
|
|
"rewards/format_reward_long_step": 0.26171875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.17724668979644775,
|
|
"step": 10
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1328125,
|
|
"completions/max_length": 1023.0,
|
|
"completions/max_terminated_length": 1023.0,
|
|
"completions/mean_length": 349.59765625,
|
|
"completions/mean_terminated_length": 403.1396484375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0176,
|
|
"grad_norm": 0.8030067086219788,
|
|
"learning_rate": 1.7460317460317458e-07,
|
|
"loss": -0.0554,
|
|
"num_tokens": 5589579.0,
|
|
"reward": 0.2793608009815216,
|
|
"reward_std": 0.404774010181427,
|
|
"rewards/accuracy_reward_long_step": 0.10546875,
|
|
"rewards/final_brier_reward_long_step": 0.08069999516010284,
|
|
"rewards/format_reward_long_step": 0.234375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.1461181938648224,
|
|
"step": 11
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 0.0,
|
|
"calib/mean_conf": 0.9,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.18359375,
|
|
"completions/max_length": 1011.0,
|
|
"completions/max_terminated_length": 1011.0,
|
|
"completions/mean_length": 322.5234375,
|
|
"completions/mean_terminated_length": 395.0526123046875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0192,
|
|
"grad_norm": 0.28563177585601807,
|
|
"learning_rate": 1.9047619047619045e-07,
|
|
"loss": -0.119,
|
|
"num_tokens": 6094689.0,
|
|
"reward": 0.35700535774230957,
|
|
"reward_std": 0.45222049951553345,
|
|
"rewards/accuracy_reward_long_step": 0.19140625,
|
|
"rewards/final_brier_reward_long_step": 0.09916991740465164,
|
|
"rewards/format_reward_long_step": 0.19921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.16478905081748962,
|
|
"step": 12
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.16015625,
|
|
"completions/max_length": 980.0,
|
|
"completions/max_terminated_length": 980.0,
|
|
"completions/mean_length": 322.3203125,
|
|
"completions/mean_terminated_length": 383.7860412597656,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0208,
|
|
"grad_norm": 0.1121608093380928,
|
|
"learning_rate": 2.0634920634920632e-07,
|
|
"loss": -0.0936,
|
|
"num_tokens": 6610131.0,
|
|
"reward": 0.410109281539917,
|
|
"reward_std": 0.4506571292877197,
|
|
"rewards/accuracy_reward_long_step": 0.1796875,
|
|
"rewards/final_brier_reward_long_step": 0.12741835415363312,
|
|
"rewards/format_reward_long_step": 0.296875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.2005188763141632,
|
|
"step": 13
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.171875,
|
|
"completions/max_length": 1013.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 317.0234375,
|
|
"completions/mean_terminated_length": 382.8207702636719,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0224,
|
|
"grad_norm": 0.11094717681407928,
|
|
"learning_rate": 2.222222222222222e-07,
|
|
"loss": -0.1125,
|
|
"num_tokens": 7122753.0,
|
|
"reward": 0.45112496614456177,
|
|
"reward_std": 0.50334632396698,
|
|
"rewards/accuracy_reward_long_step": 0.21484375,
|
|
"rewards/final_brier_reward_long_step": 0.14254721999168396,
|
|
"rewards/format_reward_long_step": 0.296875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.20882770419120789,
|
|
"step": 14
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1796875,
|
|
"completions/max_length": 966.0,
|
|
"completions/max_terminated_length": 966.0,
|
|
"completions/mean_length": 311.2265625,
|
|
"completions/mean_terminated_length": 379.4000244140625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.024,
|
|
"grad_norm": 0.13925662636756897,
|
|
"learning_rate": 2.3809523809523806e-07,
|
|
"loss": -0.1164,
|
|
"num_tokens": 7630235.0,
|
|
"reward": 0.31538814306259155,
|
|
"reward_std": 0.4458809494972229,
|
|
"rewards/accuracy_reward_long_step": 0.13671875,
|
|
"rewards/final_brier_reward_long_step": 0.10075005888938904,
|
|
"rewards/format_reward_long_step": 0.21875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.1764274686574936,
|
|
"step": 15
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 1.0,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1484375,
|
|
"completions/max_length": 1018.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 326.4453125,
|
|
"completions/mean_terminated_length": 383.3486022949219,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0256,
|
|
"grad_norm": 0.7503873109817505,
|
|
"learning_rate": 2.5396825396825396e-07,
|
|
"loss": -0.126,
|
|
"num_tokens": 8156781.0,
|
|
"reward": 0.4038216471672058,
|
|
"reward_std": 0.5121759176254272,
|
|
"rewards/accuracy_reward_long_step": 0.1875,
|
|
"rewards/final_brier_reward_long_step": 0.13070036470890045,
|
|
"rewards/format_reward_long_step": 0.26953125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.19552379846572876,
|
|
"step": 16
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1328125,
|
|
"completions/max_length": 1018.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 338.77734375,
|
|
"completions/mean_terminated_length": 390.66217041015625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0272,
|
|
"grad_norm": 0.06501276046037674,
|
|
"learning_rate": 2.698412698412698e-07,
|
|
"loss": -0.1059,
|
|
"num_tokens": 8640828.0,
|
|
"reward": 0.3153911828994751,
|
|
"reward_std": 0.4668186902999878,
|
|
"rewards/accuracy_reward_long_step": 0.12890625,
|
|
"rewards/final_brier_reward_long_step": 0.10800625383853912,
|
|
"rewards/format_reward_long_step": 0.234375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.16918347775936127,
|
|
"step": 17
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.17578125,
|
|
"completions/max_length": 979.0,
|
|
"completions/max_terminated_length": 979.0,
|
|
"completions/mean_length": 322.75390625,
|
|
"completions/mean_terminated_length": 391.58770751953125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0288,
|
|
"grad_norm": 0.1249445378780365,
|
|
"learning_rate": 2.857142857142857e-07,
|
|
"loss": -0.1224,
|
|
"num_tokens": 9147797.0,
|
|
"reward": 0.3471192717552185,
|
|
"reward_std": 0.4712105989456177,
|
|
"rewards/accuracy_reward_long_step": 0.14453125,
|
|
"rewards/final_brier_reward_long_step": 0.11726874858140945,
|
|
"rewards/format_reward_long_step": 0.25390625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.18527084589004517,
|
|
"step": 18
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 0.95,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.13671875,
|
|
"completions/max_length": 1004.0,
|
|
"completions/max_terminated_length": 1004.0,
|
|
"completions/mean_length": 343.99609375,
|
|
"completions/mean_terminated_length": 398.4751281738281,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0304,
|
|
"grad_norm": 0.07186749577522278,
|
|
"learning_rate": 3.0158730158730156e-07,
|
|
"loss": -0.0533,
|
|
"num_tokens": 9663764.0,
|
|
"reward": 0.379126638174057,
|
|
"reward_std": 0.49106040596961975,
|
|
"rewards/accuracy_reward_long_step": 0.1796875,
|
|
"rewards/final_brier_reward_long_step": 0.11015625298023224,
|
|
"rewards/format_reward_long_step": 0.25,
|
|
"rewards/stepwise_brier_reward_long_step": 0.18760032951831818,
|
|
"step": 19
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.140625,
|
|
"completions/max_length": 980.0,
|
|
"completions/max_terminated_length": 980.0,
|
|
"completions/mean_length": 344.03125,
|
|
"completions/mean_terminated_length": 400.3272705078125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.032,
|
|
"grad_norm": 0.1281915009021759,
|
|
"learning_rate": 3.1746031746031743e-07,
|
|
"loss": -0.0944,
|
|
"num_tokens": 10179540.0,
|
|
"reward": 0.3329862952232361,
|
|
"reward_std": 0.4402102828025818,
|
|
"rewards/accuracy_reward_long_step": 0.1328125,
|
|
"rewards/final_brier_reward_long_step": 0.11334909498691559,
|
|
"rewards/format_reward_long_step": 0.2578125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.17172113060951233,
|
|
"step": 20
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.17578125,
|
|
"completions/max_length": 927.0,
|
|
"completions/max_terminated_length": 927.0,
|
|
"completions/mean_length": 325.28515625,
|
|
"completions/mean_terminated_length": 394.6587829589844,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0336,
|
|
"grad_norm": 0.15362648665905,
|
|
"learning_rate": 3.333333333333333e-07,
|
|
"loss": -0.0859,
|
|
"num_tokens": 10689989.0,
|
|
"reward": 0.35474836826324463,
|
|
"reward_std": 0.48964905738830566,
|
|
"rewards/accuracy_reward_long_step": 0.15625,
|
|
"rewards/final_brier_reward_long_step": 0.11406318843364716,
|
|
"rewards/format_reward_long_step": 0.25,
|
|
"rewards/stepwise_brier_reward_long_step": 0.17993025481700897,
|
|
"step": 21
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.01171875,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 1.0,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.01171875,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.1953125,
|
|
"completions/max_length": 997.0,
|
|
"completions/max_terminated_length": 997.0,
|
|
"completions/mean_length": 311.5625,
|
|
"completions/mean_terminated_length": 387.1844787597656,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0352,
|
|
"grad_norm": 0.21162962913513184,
|
|
"learning_rate": 3.4920634920634917e-07,
|
|
"loss": -0.1431,
|
|
"num_tokens": 11193597.0,
|
|
"reward": 0.32876265048980713,
|
|
"reward_std": 0.4388379454612732,
|
|
"rewards/accuracy_reward_long_step": 0.1171875,
|
|
"rewards/final_brier_reward_long_step": 0.10059726983308792,
|
|
"rewards/format_reward_long_step": 0.26953125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.20664086937904358,
|
|
"step": 22
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.12890625,
|
|
"completions/max_length": 1010.0,
|
|
"completions/max_terminated_length": 1010.0,
|
|
"completions/mean_length": 347.2265625,
|
|
"completions/mean_terminated_length": 398.6098937988281,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 26.0,
|
|
"epoch": 0.0368,
|
|
"grad_norm": 0.08162181079387665,
|
|
"learning_rate": 3.6507936507936504e-07,
|
|
"loss": -0.0443,
|
|
"num_tokens": 11712407.0,
|
|
"reward": 0.3704327940940857,
|
|
"reward_std": 0.4654346704483032,
|
|
"rewards/accuracy_reward_long_step": 0.171875,
|
|
"rewards/final_brier_reward_long_step": 0.0984906256198883,
|
|
"rewards/format_reward_long_step": 0.26171875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.17230293154716492,
|
|
"step": 23
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1024.0,
|
|
"completions/mean_length": 358.25,
|
|
"completions/mean_terminated_length": 409.4285888671875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 4.0,
|
|
"epoch": 0.0384,
|
|
"grad_norm": 0.22265669703483582,
|
|
"learning_rate": 3.809523809523809e-07,
|
|
"loss": -0.0882,
|
|
"num_tokens": 12222919.0,
|
|
"reward": 0.39699164032936096,
|
|
"reward_std": 0.49701642990112305,
|
|
"rewards/accuracy_reward_long_step": 0.17578125,
|
|
"rewards/final_brier_reward_long_step": 0.11312989890575409,
|
|
"rewards/format_reward_long_step": 0.28125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.20921160280704498,
|
|
"step": 24
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 0.0,
|
|
"calib/mean_conf": 0.8,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.09375,
|
|
"completions/max_length": 1015.0,
|
|
"completions/max_terminated_length": 1015.0,
|
|
"completions/mean_length": 361.26171875,
|
|
"completions/mean_terminated_length": 398.63360595703125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.04,
|
|
"grad_norm": 0.07138670980930328,
|
|
"learning_rate": 3.968253968253968e-07,
|
|
"loss": -0.0724,
|
|
"num_tokens": 12748058.0,
|
|
"reward": 0.39820796251296997,
|
|
"reward_std": 0.4787534177303314,
|
|
"rewards/accuracy_reward_long_step": 0.14453125,
|
|
"rewards/final_brier_reward_long_step": 0.1328800767660141,
|
|
"rewards/format_reward_long_step": 0.32421875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.23338933289051056,
|
|
"step": 25
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.17578125,
|
|
"completions/max_length": 1008.0,
|
|
"completions/max_terminated_length": 1008.0,
|
|
"completions/mean_length": 326.24609375,
|
|
"completions/mean_terminated_length": 395.82464599609375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0416,
|
|
"grad_norm": 0.06991428881883621,
|
|
"learning_rate": 4.1269841269841265e-07,
|
|
"loss": -0.1661,
|
|
"num_tokens": 13269377.0,
|
|
"reward": 0.3767819404602051,
|
|
"reward_std": 0.46116840839385986,
|
|
"rewards/accuracy_reward_long_step": 0.15625,
|
|
"rewards/final_brier_reward_long_step": 0.11419257521629333,
|
|
"rewards/format_reward_long_step": 0.2890625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.1898101270198822,
|
|
"step": 26
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.14453125,
|
|
"completions/max_length": 989.0,
|
|
"completions/max_terminated_length": 989.0,
|
|
"completions/mean_length": 352.62109375,
|
|
"completions/mean_terminated_length": 412.1963195800781,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 4.0,
|
|
"epoch": 0.0432,
|
|
"grad_norm": 0.18298697471618652,
|
|
"learning_rate": 4.285714285714285e-07,
|
|
"loss": -0.0767,
|
|
"num_tokens": 13776568.0,
|
|
"reward": 0.3685033917427063,
|
|
"reward_std": 0.4383317232131958,
|
|
"rewards/accuracy_reward_long_step": 0.13671875,
|
|
"rewards/final_brier_reward_long_step": 0.11230936646461487,
|
|
"rewards/format_reward_long_step": 0.3046875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.20545418560504913,
|
|
"step": 27
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.15625,
|
|
"completions/max_length": 1013.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 360.98828125,
|
|
"completions/mean_terminated_length": 427.83795166015625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.0448,
|
|
"grad_norm": 0.07104546576738358,
|
|
"learning_rate": 4.444444444444444e-07,
|
|
"loss": -0.134,
|
|
"num_tokens": 14287893.0,
|
|
"reward": 0.41739368438720703,
|
|
"reward_std": 0.5269143581390381,
|
|
"rewards/accuracy_reward_long_step": 0.16796875,
|
|
"rewards/final_brier_reward_long_step": 0.14333046972751617,
|
|
"rewards/format_reward_long_step": 0.31640625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.22155673801898956,
|
|
"step": 28
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 0.0,
|
|
"calib/mean_conf": 0.75,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.140625,
|
|
"completions/max_length": 966.0,
|
|
"completions/max_terminated_length": 966.0,
|
|
"completions/mean_length": 338.6015625,
|
|
"completions/mean_terminated_length": 394.0090637207031,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0464,
|
|
"grad_norm": 0.1085757464170456,
|
|
"learning_rate": 4.6031746031746025e-07,
|
|
"loss": -0.0861,
|
|
"num_tokens": 14807951.0,
|
|
"reward": 0.44378989934921265,
|
|
"reward_std": 0.4950755834579468,
|
|
"rewards/accuracy_reward_long_step": 0.16796875,
|
|
"rewards/final_brier_reward_long_step": 0.13291756808757782,
|
|
"rewards/format_reward_long_step": 0.359375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.2516169548034668,
|
|
"step": 29
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 0.95,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.12109375,
|
|
"completions/max_length": 980.0,
|
|
"completions/max_terminated_length": 980.0,
|
|
"completions/mean_length": 353.80078125,
|
|
"completions/mean_terminated_length": 402.5466613769531,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.048,
|
|
"grad_norm": 0.08588235080242157,
|
|
"learning_rate": 4.761904761904761e-07,
|
|
"loss": -0.0824,
|
|
"num_tokens": 15312820.0,
|
|
"reward": 0.5131794214248657,
|
|
"reward_std": 0.5309146046638489,
|
|
"rewards/accuracy_reward_long_step": 0.20703125,
|
|
"rewards/final_brier_reward_long_step": 0.17949271202087402,
|
|
"rewards/format_reward_long_step": 0.390625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.2638500928878784,
|
|
"step": 30
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 0.98,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.09375,
|
|
"completions/max_length": 992.0,
|
|
"completions/max_terminated_length": 992.0,
|
|
"completions/mean_length": 355.95703125,
|
|
"completions/mean_terminated_length": 392.7801818847656,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 3.0,
|
|
"epoch": 0.0496,
|
|
"grad_norm": 0.0898517370223999,
|
|
"learning_rate": 4.92063492063492e-07,
|
|
"loss": -0.0724,
|
|
"num_tokens": 15823481.0,
|
|
"reward": 0.5299139618873596,
|
|
"reward_std": 0.57805997133255,
|
|
"rewards/accuracy_reward_long_step": 0.2421875,
|
|
"rewards/final_brier_reward_long_step": 0.17374873161315918,
|
|
"rewards/format_reward_long_step": 0.35546875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.26621949672698975,
|
|
"step": 31
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 0.0,
|
|
"calib/mean_conf": 0.9,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.125,
|
|
"completions/max_length": 1013.0,
|
|
"completions/max_terminated_length": 1013.0,
|
|
"completions/mean_length": 358.1640625,
|
|
"completions/mean_terminated_length": 409.33038330078125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0512,
|
|
"grad_norm": 1.044785737991333,
|
|
"learning_rate": 5.079365079365079e-07,
|
|
"loss": -0.1237,
|
|
"num_tokens": 16325931.0,
|
|
"reward": 0.44143152236938477,
|
|
"reward_std": 0.4683123230934143,
|
|
"rewards/accuracy_reward_long_step": 0.16796875,
|
|
"rewards/final_brier_reward_long_step": 0.14440733194351196,
|
|
"rewards/format_reward_long_step": 0.35546875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.2385062575340271,
|
|
"step": 32
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.125,
|
|
"completions/max_length": 1016.0,
|
|
"completions/max_terminated_length": 1016.0,
|
|
"completions/mean_length": 359.73046875,
|
|
"completions/mean_terminated_length": 411.12054443359375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0528,
|
|
"grad_norm": 0.04461158439517021,
|
|
"learning_rate": 5.238095238095238e-07,
|
|
"loss": -0.0966,
|
|
"num_tokens": 16843398.0,
|
|
"reward": 0.44179078936576843,
|
|
"reward_std": 0.507757306098938,
|
|
"rewards/accuracy_reward_long_step": 0.16796875,
|
|
"rewards/final_brier_reward_long_step": 0.15976552665233612,
|
|
"rewards/format_reward_long_step": 0.36328125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.20896016061306,
|
|
"step": 33
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0078125,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 0.96,
|
|
"calib/nonempty_final_conf_rate": 0.0078125,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.010000000000000009,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.09765625,
|
|
"completions/max_length": 995.0,
|
|
"completions/max_terminated_length": 995.0,
|
|
"completions/mean_length": 345.3828125,
|
|
"completions/mean_terminated_length": 382.76190185546875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0544,
|
|
"grad_norm": 0.14507651329040527,
|
|
"learning_rate": 5.396825396825396e-07,
|
|
"loss": -0.1239,
|
|
"num_tokens": 17357632.0,
|
|
"reward": 0.4228193163871765,
|
|
"reward_std": 0.46190011501312256,
|
|
"rewards/accuracy_reward_long_step": 0.125,
|
|
"rewards/final_brier_reward_long_step": 0.15529990196228027,
|
|
"rewards/format_reward_long_step": 0.3828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.2703523635864258,
|
|
"step": 34
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.109375,
|
|
"completions/max_length": 980.0,
|
|
"completions/max_terminated_length": 980.0,
|
|
"completions/mean_length": 342.94140625,
|
|
"completions/mean_terminated_length": 385.0570068359375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 96.0,
|
|
"epoch": 0.056,
|
|
"grad_norm": 0.13200579583644867,
|
|
"learning_rate": 5.555555555555555e-07,
|
|
"loss": -0.0371,
|
|
"num_tokens": 17874505.0,
|
|
"reward": 0.6404584646224976,
|
|
"reward_std": 0.5706257820129395,
|
|
"rewards/accuracy_reward_long_step": 0.29296875,
|
|
"rewards/final_brier_reward_long_step": 0.22589921951293945,
|
|
"rewards/format_reward_long_step": 0.42578125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.31249701976776123,
|
|
"step": 35
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 0.97,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.109375,
|
|
"completions/max_length": 955.0,
|
|
"completions/max_terminated_length": 955.0,
|
|
"completions/mean_length": 314.12109375,
|
|
"completions/mean_terminated_length": 352.6973571777344,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 4.0,
|
|
"epoch": 0.0576,
|
|
"grad_norm": 0.15383663773536682,
|
|
"learning_rate": 5.714285714285714e-07,
|
|
"loss": -0.1074,
|
|
"num_tokens": 18373048.0,
|
|
"reward": 0.6416888236999512,
|
|
"reward_std": 0.582075834274292,
|
|
"rewards/accuracy_reward_long_step": 0.234375,
|
|
"rewards/final_brier_reward_long_step": 0.23546718060970306,
|
|
"rewards/format_reward_long_step": 0.5078125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.37816306948661804,
|
|
"step": 36
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 0.94,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.11328125,
|
|
"completions/max_length": 987.0,
|
|
"completions/max_terminated_length": 987.0,
|
|
"completions/mean_length": 334.453125,
|
|
"completions/mean_terminated_length": 377.18060302734375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 88.0,
|
|
"epoch": 0.0592,
|
|
"grad_norm": 0.16420908272266388,
|
|
"learning_rate": 5.873015873015873e-07,
|
|
"loss": -0.0809,
|
|
"num_tokens": 18885148.0,
|
|
"reward": 0.6179122924804688,
|
|
"reward_std": 0.5788693428039551,
|
|
"rewards/accuracy_reward_long_step": 0.22265625,
|
|
"rewards/final_brier_reward_long_step": 0.20363515615463257,
|
|
"rewards/format_reward_long_step": 0.49609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.385201632976532,
|
|
"step": 37
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.078125,
|
|
"completions/max_length": 996.0,
|
|
"completions/max_terminated_length": 996.0,
|
|
"completions/mean_length": 345.3828125,
|
|
"completions/mean_terminated_length": 374.6525573730469,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 4.0,
|
|
"epoch": 0.0608,
|
|
"grad_norm": 0.20274563133716583,
|
|
"learning_rate": 6.031746031746031e-07,
|
|
"loss": -0.0143,
|
|
"num_tokens": 19403174.0,
|
|
"reward": 0.5915793180465698,
|
|
"reward_std": 0.5329806804656982,
|
|
"rewards/accuracy_reward_long_step": 0.16015625,
|
|
"rewards/final_brier_reward_long_step": 0.2439812570810318,
|
|
"rewards/format_reward_long_step": 0.5390625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.40358591079711914,
|
|
"step": 38
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0859375,
|
|
"completions/max_length": 918.0,
|
|
"completions/max_terminated_length": 918.0,
|
|
"completions/mean_length": 326.77734375,
|
|
"completions/mean_terminated_length": 357.5000305175781,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 54.0,
|
|
"epoch": 0.0624,
|
|
"grad_norm": 0.3366471827030182,
|
|
"learning_rate": 6.19047619047619e-07,
|
|
"loss": -0.0919,
|
|
"num_tokens": 19907301.0,
|
|
"reward": 0.5409894585609436,
|
|
"reward_std": 0.4594622850418091,
|
|
"rewards/accuracy_reward_long_step": 0.13671875,
|
|
"rewards/final_brier_reward_long_step": 0.20591670274734497,
|
|
"rewards/format_reward_long_step": 0.546875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.31741613149642944,
|
|
"step": 39
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0078125,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 0.95,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0078125,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.09765625,
|
|
"completions/max_length": 1015.0,
|
|
"completions/max_terminated_length": 1015.0,
|
|
"completions/mean_length": 326.78515625,
|
|
"completions/mean_terminated_length": 362.1515197753906,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 66.0,
|
|
"epoch": 0.064,
|
|
"grad_norm": 0.1065516546368599,
|
|
"learning_rate": 6.349206349206349e-07,
|
|
"loss": -0.0945,
|
|
"num_tokens": 20406822.0,
|
|
"reward": 0.6141079664230347,
|
|
"reward_std": 0.5281961560249329,
|
|
"rewards/accuracy_reward_long_step": 0.1875,
|
|
"rewards/final_brier_reward_long_step": 0.23533864319324493,
|
|
"rewards/format_reward_long_step": 0.53125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.4085933566093445,
|
|
"step": 40
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 1.0,
|
|
"calib/mean_conf": 0.95,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.125,
|
|
"completions/max_length": 1020.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 306.8515625,
|
|
"completions/mean_terminated_length": 350.6875305175781,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 83.0,
|
|
"epoch": 0.0656,
|
|
"grad_norm": 0.21869583427906036,
|
|
"learning_rate": 6.507936507936507e-07,
|
|
"loss": -0.1106,
|
|
"num_tokens": 20919024.0,
|
|
"reward": 0.5630888342857361,
|
|
"reward_std": 0.48765885829925537,
|
|
"rewards/accuracy_reward_long_step": 0.15234375,
|
|
"rewards/final_brier_reward_long_step": 0.21809795498847961,
|
|
"rewards/format_reward_long_step": 0.5234375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.37800735235214233,
|
|
"step": 41
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.09765625,
|
|
"completions/max_length": 956.0,
|
|
"completions/max_terminated_length": 956.0,
|
|
"completions/mean_length": 311.23828125,
|
|
"completions/mean_terminated_length": 344.9220886230469,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 4.0,
|
|
"epoch": 0.0672,
|
|
"grad_norm": 0.12498702108860016,
|
|
"learning_rate": 6.666666666666666e-07,
|
|
"loss": -0.0945,
|
|
"num_tokens": 21415717.0,
|
|
"reward": 0.7428398132324219,
|
|
"reward_std": 0.5535950660705566,
|
|
"rewards/accuracy_reward_long_step": 0.30078125,
|
|
"rewards/final_brier_reward_long_step": 0.27300766110420227,
|
|
"rewards/format_reward_long_step": 0.5625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.37022653222084045,
|
|
"step": 42
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 0.0,
|
|
"calib/mean_conf": 0.8,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.05859375,
|
|
"completions/max_length": 978.0,
|
|
"completions/max_terminated_length": 978.0,
|
|
"completions/mean_length": 316.40234375,
|
|
"completions/mean_terminated_length": 336.095458984375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 25.0,
|
|
"epoch": 0.0688,
|
|
"grad_norm": 0.10794218629598618,
|
|
"learning_rate": 6.825396825396826e-07,
|
|
"loss": -0.0312,
|
|
"num_tokens": 21930028.0,
|
|
"reward": 0.6837334632873535,
|
|
"reward_std": 0.5118536949157715,
|
|
"rewards/accuracy_reward_long_step": 0.2265625,
|
|
"rewards/final_brier_reward_long_step": 0.2446330040693283,
|
|
"rewards/format_reward_long_step": 0.57421875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.43561333417892456,
|
|
"step": 43
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.078125,
|
|
"completions/max_length": 999.0,
|
|
"completions/max_terminated_length": 999.0,
|
|
"completions/mean_length": 312.5703125,
|
|
"completions/mean_terminated_length": 339.059326171875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.0704,
|
|
"grad_norm": 0.08316317200660706,
|
|
"learning_rate": 6.984126984126983e-07,
|
|
"loss": -0.0437,
|
|
"num_tokens": 22445902.0,
|
|
"reward": 0.758315920829773,
|
|
"reward_std": 0.5356731414794922,
|
|
"rewards/accuracy_reward_long_step": 0.28515625,
|
|
"rewards/final_brier_reward_long_step": 0.2901049256324768,
|
|
"rewards/format_reward_long_step": 0.60546875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.39159637689590454,
|
|
"step": 44
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 0.0,
|
|
"calib/mean_conf": 0.85,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.05859375,
|
|
"completions/max_length": 1011.0,
|
|
"completions/max_terminated_length": 1011.0,
|
|
"completions/mean_length": 316.65625,
|
|
"completions/mean_terminated_length": 336.36517333984375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.072,
|
|
"grad_norm": 0.14549246430397034,
|
|
"learning_rate": 7.142857142857143e-07,
|
|
"loss": -0.0613,
|
|
"num_tokens": 22949038.0,
|
|
"reward": 0.7944426536560059,
|
|
"reward_std": 0.5610437393188477,
|
|
"rewards/accuracy_reward_long_step": 0.26953125,
|
|
"rewards/final_brier_reward_long_step": 0.320908784866333,
|
|
"rewards/format_reward_long_step": 0.6484375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.48186174035072327,
|
|
"step": 45
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.05859375,
|
|
"completions/max_length": 960.0,
|
|
"completions/max_terminated_length": 960.0,
|
|
"completions/mean_length": 295.67578125,
|
|
"completions/mean_terminated_length": 314.078857421875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 4.0,
|
|
"epoch": 0.0736,
|
|
"grad_norm": 0.07323121279478073,
|
|
"learning_rate": 7.301587301587301e-07,
|
|
"loss": -0.0735,
|
|
"num_tokens": 23427059.0,
|
|
"reward": 0.6723222732543945,
|
|
"reward_std": 0.4763370752334595,
|
|
"rewards/accuracy_reward_long_step": 0.15234375,
|
|
"rewards/final_brier_reward_long_step": 0.26658162474632263,
|
|
"rewards/format_reward_long_step": 0.6875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.43833261728286743,
|
|
"step": 46
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0078125,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0078125,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 841.0,
|
|
"completions/max_terminated_length": 841.0,
|
|
"completions/mean_length": 306.3046875,
|
|
"completions/mean_terminated_length": 321.36883544921875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 14.0,
|
|
"epoch": 0.0752,
|
|
"grad_norm": 0.2430606335401535,
|
|
"learning_rate": 7.46031746031746e-07,
|
|
"loss": -0.1118,
|
|
"num_tokens": 23915857.0,
|
|
"reward": 0.9021989703178406,
|
|
"reward_std": 0.5503741502761841,
|
|
"rewards/accuracy_reward_long_step": 0.31640625,
|
|
"rewards/final_brier_reward_long_step": 0.36550503969192505,
|
|
"rewards/format_reward_long_step": 0.7265625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.5245407223701477,
|
|
"step": 47
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0390625,
|
|
"completions/max_length": 898.0,
|
|
"completions/max_terminated_length": 898.0,
|
|
"completions/mean_length": 315.328125,
|
|
"completions/mean_terminated_length": 328.1463317871094,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 63.0,
|
|
"epoch": 0.0768,
|
|
"grad_norm": 0.23907212913036346,
|
|
"learning_rate": 7.619047619047618e-07,
|
|
"loss": -0.0704,
|
|
"num_tokens": 24407109.0,
|
|
"reward": 0.8392512798309326,
|
|
"reward_std": 0.5024853944778442,
|
|
"rewards/accuracy_reward_long_step": 0.25390625,
|
|
"rewards/final_brier_reward_long_step": 0.34439170360565186,
|
|
"rewards/format_reward_long_step": 0.71875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.5594882965087891,
|
|
"step": 48
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.05078125,
|
|
"completions/max_length": 821.0,
|
|
"completions/max_terminated_length": 821.0,
|
|
"completions/mean_length": 306.22265625,
|
|
"completions/mean_terminated_length": 322.60491943359375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 78.0,
|
|
"epoch": 0.0784,
|
|
"grad_norm": 0.13986116647720337,
|
|
"learning_rate": 7.777777777777778e-07,
|
|
"loss": -0.0654,
|
|
"num_tokens": 24920814.0,
|
|
"reward": 0.9134366512298584,
|
|
"reward_std": 0.5091613531112671,
|
|
"rewards/accuracy_reward_long_step": 0.2734375,
|
|
"rewards/final_brier_reward_long_step": 0.3957996368408203,
|
|
"rewards/format_reward_long_step": 0.79296875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.5782594680786133,
|
|
"step": 49
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.046875,
|
|
"completions/max_length": 819.0,
|
|
"completions/max_terminated_length": 819.0,
|
|
"completions/mean_length": 281.90625,
|
|
"completions/mean_terminated_length": 295.7704772949219,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 51.0,
|
|
"epoch": 0.08,
|
|
"grad_norm": 0.22414922714233398,
|
|
"learning_rate": 7.936507936507936e-07,
|
|
"loss": -0.0835,
|
|
"num_tokens": 25417334.0,
|
|
"reward": 0.87409508228302,
|
|
"reward_std": 0.49227654933929443,
|
|
"rewards/accuracy_reward_long_step": 0.23046875,
|
|
"rewards/final_brier_reward_long_step": 0.35886436700820923,
|
|
"rewards/format_reward_long_step": 0.80859375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.59845370054245,
|
|
"step": 50
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.00390625,
|
|
"calib/format_rate": 0.0,
|
|
"calib/frac_conf_gt_0.9": 0.0,
|
|
"calib/mean_conf": 0.88,
|
|
"calib/nonempty_final_conf_rate": 0.00390625,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/std_conf": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0234375,
|
|
"completions/max_length": 998.0,
|
|
"completions/max_terminated_length": 998.0,
|
|
"completions/mean_length": 297.73046875,
|
|
"completions/mean_terminated_length": 304.8760070800781,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 69.0,
|
|
"epoch": 0.0816,
|
|
"grad_norm": 0.15504558384418488,
|
|
"learning_rate": 8.095238095238095e-07,
|
|
"loss": -0.0377,
|
|
"num_tokens": 25917625.0,
|
|
"reward": 0.8817519545555115,
|
|
"reward_std": 0.5145280361175537,
|
|
"rewards/accuracy_reward_long_step": 0.2265625,
|
|
"rewards/final_brier_reward_long_step": 0.36166319251060486,
|
|
"rewards/format_reward_long_step": 0.828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6028447151184082,
|
|
"step": 51
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01953125,
|
|
"completions/max_length": 777.0,
|
|
"completions/max_terminated_length": 777.0,
|
|
"completions/mean_length": 279.953125,
|
|
"completions/mean_terminated_length": 285.5298767089844,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.0832,
|
|
"grad_norm": 0.06322144716978073,
|
|
"learning_rate": 8.253968253968253e-07,
|
|
"loss": -0.0324,
|
|
"num_tokens": 26420901.0,
|
|
"reward": 0.9491708278656006,
|
|
"reward_std": 0.4398835599422455,
|
|
"rewards/accuracy_reward_long_step": 0.25390625,
|
|
"rewards/final_brier_reward_long_step": 0.42088940739631653,
|
|
"rewards/format_reward_long_step": 0.8828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.594543993473053,
|
|
"step": 52
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.02734375,
|
|
"completions/max_length": 888.0,
|
|
"completions/max_terminated_length": 888.0,
|
|
"completions/mean_length": 299.453125,
|
|
"completions/mean_terminated_length": 307.8714599609375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 88.0,
|
|
"epoch": 0.0848,
|
|
"grad_norm": 0.045589037239551544,
|
|
"learning_rate": 8.412698412698413e-07,
|
|
"loss": -0.0128,
|
|
"num_tokens": 26920409.0,
|
|
"reward": 0.8190128803253174,
|
|
"reward_std": 0.3742133677005768,
|
|
"rewards/accuracy_reward_long_step": 0.16015625,
|
|
"rewards/final_brier_reward_long_step": 0.3180277347564697,
|
|
"rewards/format_reward_long_step": 0.84375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6298986673355103,
|
|
"step": 53
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01171875,
|
|
"completions/max_length": 840.0,
|
|
"completions/max_terminated_length": 840.0,
|
|
"completions/mean_length": 290.125,
|
|
"completions/mean_terminated_length": 293.5652160644531,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 0.0864,
|
|
"grad_norm": 0.2063865065574646,
|
|
"learning_rate": 8.57142857142857e-07,
|
|
"loss": -0.0173,
|
|
"num_tokens": 27414017.0,
|
|
"reward": 0.9507964253425598,
|
|
"reward_std": 0.426396906375885,
|
|
"rewards/accuracy_reward_long_step": 0.25,
|
|
"rewards/final_brier_reward_long_step": 0.38583073019981384,
|
|
"rewards/format_reward_long_step": 0.890625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.636104941368103,
|
|
"step": 54
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.02734375,
|
|
"completions/max_length": 757.0,
|
|
"completions/max_terminated_length": 757.0,
|
|
"completions/mean_length": 291.60546875,
|
|
"completions/mean_terminated_length": 299.8031921386719,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 28.0,
|
|
"epoch": 0.088,
|
|
"grad_norm": 0.12576599419116974,
|
|
"learning_rate": 8.73015873015873e-07,
|
|
"loss": -0.0195,
|
|
"num_tokens": 27925500.0,
|
|
"reward": 1.0254812240600586,
|
|
"reward_std": 0.5260324478149414,
|
|
"rewards/accuracy_reward_long_step": 0.31640625,
|
|
"rewards/final_brier_reward_long_step": 0.45056432485580444,
|
|
"rewards/format_reward_long_step": 0.85546875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.674798309803009,
|
|
"step": 55
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01171875,
|
|
"completions/max_length": 969.0,
|
|
"completions/max_terminated_length": 969.0,
|
|
"completions/mean_length": 289.62890625,
|
|
"completions/mean_terminated_length": 293.0632629394531,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 97.0,
|
|
"epoch": 0.0896,
|
|
"grad_norm": 0.13417616486549377,
|
|
"learning_rate": 8.888888888888888e-07,
|
|
"loss": -0.0375,
|
|
"num_tokens": 28431053.0,
|
|
"reward": 1.1285545825958252,
|
|
"reward_std": 0.4153571128845215,
|
|
"rewards/accuracy_reward_long_step": 0.38671875,
|
|
"rewards/final_brier_reward_long_step": 0.4956166744232178,
|
|
"rewards/format_reward_long_step": 0.92578125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6201643943786621,
|
|
"step": 56
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0234375,
|
|
"completions/max_length": 763.0,
|
|
"completions/max_terminated_length": 763.0,
|
|
"completions/mean_length": 269.65625,
|
|
"completions/mean_terminated_length": 276.1280212402344,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 15.0,
|
|
"epoch": 0.0912,
|
|
"grad_norm": 0.06763328611850739,
|
|
"learning_rate": 9.047619047619047e-07,
|
|
"loss": -0.0317,
|
|
"num_tokens": 28913637.0,
|
|
"reward": 0.9394167065620422,
|
|
"reward_std": 0.3821975290775299,
|
|
"rewards/accuracy_reward_long_step": 0.2421875,
|
|
"rewards/final_brier_reward_long_step": 0.423524409532547,
|
|
"rewards/format_reward_long_step": 0.91796875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.5294549465179443,
|
|
"step": 57
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01953125,
|
|
"completions/max_length": 827.0,
|
|
"completions/max_terminated_length": 827.0,
|
|
"completions/mean_length": 271.98828125,
|
|
"completions/mean_terminated_length": 277.4063720703125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.0928,
|
|
"grad_norm": 0.25471770763397217,
|
|
"learning_rate": 9.206349206349205e-07,
|
|
"loss": -0.0193,
|
|
"num_tokens": 29408858.0,
|
|
"reward": 1.0430493354797363,
|
|
"reward_std": 0.45021143555641174,
|
|
"rewards/accuracy_reward_long_step": 0.30078125,
|
|
"rewards/final_brier_reward_long_step": 0.45864561200141907,
|
|
"rewards/format_reward_long_step": 0.90625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6979269981384277,
|
|
"step": 58
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 999.0,
|
|
"completions/max_terminated_length": 999.0,
|
|
"completions/mean_length": 268.2109375,
|
|
"completions/mean_terminated_length": 272.46826171875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 81.0,
|
|
"epoch": 0.0944,
|
|
"grad_norm": 0.08365736901760101,
|
|
"learning_rate": 9.365079365079365e-07,
|
|
"loss": -0.0398,
|
|
"num_tokens": 29906672.0,
|
|
"reward": 0.9959282875061035,
|
|
"reward_std": 0.3980240225791931,
|
|
"rewards/accuracy_reward_long_step": 0.28515625,
|
|
"rewards/final_brier_reward_long_step": 0.4108448326587677,
|
|
"rewards/format_reward_long_step": 0.8984375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6353680491447449,
|
|
"step": 59
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 870.0,
|
|
"completions/max_terminated_length": 870.0,
|
|
"completions/mean_length": 270.6953125,
|
|
"completions/mean_terminated_length": 274.9920654296875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 29.0,
|
|
"epoch": 0.096,
|
|
"grad_norm": 0.037650614976882935,
|
|
"learning_rate": 9.523809523809522e-07,
|
|
"loss": -0.0363,
|
|
"num_tokens": 30394242.0,
|
|
"reward": 0.9999738931655884,
|
|
"reward_std": 0.4165264964103699,
|
|
"rewards/accuracy_reward_long_step": 0.2734375,
|
|
"rewards/final_brier_reward_long_step": 0.44108301401138306,
|
|
"rewards/format_reward_long_step": 0.90234375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.66037517786026,
|
|
"step": 60
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0078125,
|
|
"completions/max_length": 865.0,
|
|
"completions/max_terminated_length": 865.0,
|
|
"completions/mean_length": 265.21875,
|
|
"completions/mean_terminated_length": 267.3070983886719,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.0976,
|
|
"grad_norm": 0.106496162712574,
|
|
"learning_rate": 9.682539682539682e-07,
|
|
"loss": -0.0001,
|
|
"num_tokens": 30893266.0,
|
|
"reward": 1.1621458530426025,
|
|
"reward_std": 0.4202546775341034,
|
|
"rewards/accuracy_reward_long_step": 0.390625,
|
|
"rewards/final_brier_reward_long_step": 0.533796489238739,
|
|
"rewards/format_reward_long_step": 0.94140625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6694742441177368,
|
|
"step": 61
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01171875,
|
|
"completions/max_length": 643.0,
|
|
"completions/max_terminated_length": 643.0,
|
|
"completions/mean_length": 263.109375,
|
|
"completions/mean_terminated_length": 266.229248046875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.0992,
|
|
"grad_norm": 0.10262063890695572,
|
|
"learning_rate": 9.84126984126984e-07,
|
|
"loss": 0.0064,
|
|
"num_tokens": 31379758.0,
|
|
"reward": 1.0871508121490479,
|
|
"reward_std": 0.3243914842605591,
|
|
"rewards/accuracy_reward_long_step": 0.3203125,
|
|
"rewards/final_brier_reward_long_step": 0.49898362159729004,
|
|
"rewards/format_reward_long_step": 0.94140625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6855573058128357,
|
|
"step": 62
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 797.0,
|
|
"completions/max_terminated_length": 797.0,
|
|
"completions/mean_length": 269.95703125,
|
|
"completions/mean_terminated_length": 271.0157165527344,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 113.0,
|
|
"epoch": 0.1008,
|
|
"grad_norm": 0.045267656445503235,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0051,
|
|
"num_tokens": 31875611.0,
|
|
"reward": 1.0248790979385376,
|
|
"reward_std": 0.3271501064300537,
|
|
"rewards/accuracy_reward_long_step": 0.26171875,
|
|
"rewards/final_brier_reward_long_step": 0.5174949169158936,
|
|
"rewards/format_reward_long_step": 0.9609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6132714152336121,
|
|
"step": 63
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0078125,
|
|
"completions/max_length": 709.0,
|
|
"completions/max_terminated_length": 709.0,
|
|
"completions/mean_length": 243.5,
|
|
"completions/mean_terminated_length": 245.41732788085938,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 74.0,
|
|
"epoch": 0.1024,
|
|
"grad_norm": 0.09396693110466003,
|
|
"learning_rate": 9.98220640569395e-07,
|
|
"loss": -0.0249,
|
|
"num_tokens": 32375531.0,
|
|
"reward": 1.0874074697494507,
|
|
"reward_std": 0.405730664730072,
|
|
"rewards/accuracy_reward_long_step": 0.3046875,
|
|
"rewards/final_brier_reward_long_step": 0.4932839870452881,
|
|
"rewards/format_reward_long_step": 0.94140625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7547836899757385,
|
|
"step": 64
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 603.0,
|
|
"completions/max_terminated_length": 603.0,
|
|
"completions/mean_length": 238.4765625,
|
|
"completions/mean_terminated_length": 242.2619171142578,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 97.0,
|
|
"epoch": 0.104,
|
|
"grad_norm": 0.10586308687925339,
|
|
"learning_rate": 9.9644128113879e-07,
|
|
"loss": -0.0241,
|
|
"num_tokens": 32865229.0,
|
|
"reward": 1.0684640407562256,
|
|
"reward_std": 0.37198448181152344,
|
|
"rewards/accuracy_reward_long_step": 0.30078125,
|
|
"rewards/final_brier_reward_long_step": 0.5142987966537476,
|
|
"rewards/format_reward_long_step": 0.9375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.68143230676651,
|
|
"step": 65
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.01171875,
|
|
"completions/max_length": 795.0,
|
|
"completions/max_terminated_length": 795.0,
|
|
"completions/mean_length": 235.64453125,
|
|
"completions/mean_terminated_length": 238.43875122070312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 82.0,
|
|
"epoch": 0.1056,
|
|
"grad_norm": 0.06174841523170471,
|
|
"learning_rate": 9.94661921708185e-07,
|
|
"loss": -0.0329,
|
|
"num_tokens": 33359506.0,
|
|
"reward": 1.0320240259170532,
|
|
"reward_std": 0.3543888330459595,
|
|
"rewards/accuracy_reward_long_step": 0.28125,
|
|
"rewards/final_brier_reward_long_step": 0.523512601852417,
|
|
"rewards/format_reward_long_step": 0.94140625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.5967710614204407,
|
|
"step": 66
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 627.0,
|
|
"completions/max_terminated_length": 627.0,
|
|
"completions/mean_length": 236.34375,
|
|
"completions/mean_terminated_length": 237.27059936523438,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.1072,
|
|
"grad_norm": 0.07356097549200058,
|
|
"learning_rate": 9.9288256227758e-07,
|
|
"loss": -0.0126,
|
|
"num_tokens": 33855346.0,
|
|
"reward": 1.2128088474273682,
|
|
"reward_std": 0.32791173458099365,
|
|
"rewards/accuracy_reward_long_step": 0.3828125,
|
|
"rewards/final_brier_reward_long_step": 0.6027936935424805,
|
|
"rewards/format_reward_long_step": 0.96484375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7875038385391235,
|
|
"step": 67
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 1007.0,
|
|
"completions/max_terminated_length": 1007.0,
|
|
"completions/mean_length": 241.8046875,
|
|
"completions/mean_terminated_length": 242.75296020507812,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 95.0,
|
|
"epoch": 0.1088,
|
|
"grad_norm": 0.09495385736227036,
|
|
"learning_rate": 9.91103202846975e-07,
|
|
"loss": 0.0023,
|
|
"num_tokens": 34342288.0,
|
|
"reward": 1.1574406623840332,
|
|
"reward_std": 0.36004209518432617,
|
|
"rewards/accuracy_reward_long_step": 0.328125,
|
|
"rewards/final_brier_reward_long_step": 0.6040390729904175,
|
|
"rewards/format_reward_long_step": 0.96875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7757238149642944,
|
|
"step": 68
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0078125,
|
|
"completions/max_length": 808.0,
|
|
"completions/max_terminated_length": 808.0,
|
|
"completions/mean_length": 220.72265625,
|
|
"completions/mean_terminated_length": 222.46063232421875,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 100.0,
|
|
"epoch": 0.1104,
|
|
"grad_norm": 0.4110874533653259,
|
|
"learning_rate": 9.8932384341637e-07,
|
|
"loss": 0.009,
|
|
"num_tokens": 34802673.0,
|
|
"reward": 1.3306258916854858,
|
|
"reward_std": 0.33039307594299316,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.6564062833786011,
|
|
"rewards/format_reward_long_step": 0.98046875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7676596641540527,
|
|
"step": 69
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 479.0,
|
|
"completions/max_terminated_length": 479.0,
|
|
"completions/mean_length": 224.27734375,
|
|
"completions/mean_terminated_length": 224.27734375,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.112,
|
|
"grad_norm": 0.05017966777086258,
|
|
"learning_rate": 9.87544483985765e-07,
|
|
"loss": -0.0149,
|
|
"num_tokens": 35287528.0,
|
|
"reward": 1.2564759254455566,
|
|
"reward_std": 0.27206528186798096,
|
|
"rewards/accuracy_reward_long_step": 0.421875,
|
|
"rewards/final_brier_reward_long_step": 0.6693449020385742,
|
|
"rewards/format_reward_long_step": 0.97265625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7237462401390076,
|
|
"step": 70
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 550.0,
|
|
"completions/max_terminated_length": 550.0,
|
|
"completions/mean_length": 218.94140625,
|
|
"completions/mean_terminated_length": 218.94140625,
|
|
"completions/min_length": 4.0,
|
|
"completions/min_terminated_length": 4.0,
|
|
"epoch": 0.1136,
|
|
"grad_norm": 0.060872942209243774,
|
|
"learning_rate": 9.8576512455516e-07,
|
|
"loss": -0.0093,
|
|
"num_tokens": 35762137.0,
|
|
"reward": 1.3191676139831543,
|
|
"reward_std": 0.35025539994239807,
|
|
"rewards/accuracy_reward_long_step": 0.4609375,
|
|
"rewards/final_brier_reward_long_step": 0.6817148327827454,
|
|
"rewards/format_reward_long_step": 0.984375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7824558615684509,
|
|
"step": 71
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 705.0,
|
|
"completions/max_terminated_length": 705.0,
|
|
"completions/mean_length": 223.96875,
|
|
"completions/mean_terminated_length": 223.96875,
|
|
"completions/min_length": 55.0,
|
|
"completions/min_terminated_length": 55.0,
|
|
"epoch": 0.1152,
|
|
"grad_norm": 0.11113214492797852,
|
|
"learning_rate": 9.83985765124555e-07,
|
|
"loss": 0.0053,
|
|
"num_tokens": 36233089.0,
|
|
"reward": 1.1259610652923584,
|
|
"reward_std": 0.26902925968170166,
|
|
"rewards/accuracy_reward_long_step": 0.29296875,
|
|
"rewards/final_brier_reward_long_step": 0.6245523691177368,
|
|
"rewards/format_reward_long_step": 0.97265625,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7621045112609863,
|
|
"step": 72
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0078125,
|
|
"completions/max_length": 589.0,
|
|
"completions/max_terminated_length": 589.0,
|
|
"completions/mean_length": 214.41015625,
|
|
"completions/mean_terminated_length": 216.09841918945312,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 77.0,
|
|
"epoch": 0.1168,
|
|
"grad_norm": 0.15858663618564606,
|
|
"learning_rate": 9.8220640569395e-07,
|
|
"loss": -0.0158,
|
|
"num_tokens": 36715946.0,
|
|
"reward": 1.0615253448486328,
|
|
"reward_std": 0.2766742706298828,
|
|
"rewards/accuracy_reward_long_step": 0.234375,
|
|
"rewards/final_brier_reward_long_step": 0.6290937662124634,
|
|
"rewards/format_reward_long_step": 0.96484375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7498202919960022,
|
|
"step": 73
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 600.0,
|
|
"completions/max_terminated_length": 600.0,
|
|
"completions/mean_length": 214.25390625,
|
|
"completions/mean_terminated_length": 214.25390625,
|
|
"completions/min_length": 113.0,
|
|
"completions/min_terminated_length": 113.0,
|
|
"epoch": 0.1184,
|
|
"grad_norm": 0.5460712909698486,
|
|
"learning_rate": 9.804270462633451e-07,
|
|
"loss": -0.011,
|
|
"num_tokens": 37180563.0,
|
|
"reward": 1.268796443939209,
|
|
"reward_std": 0.3486781716346741,
|
|
"rewards/accuracy_reward_long_step": 0.40625,
|
|
"rewards/final_brier_reward_long_step": 0.6875852346420288,
|
|
"rewards/format_reward_long_step": 0.98046875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8016629219055176,
|
|
"step": 74
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0078125,
|
|
"completions/max_length": 712.0,
|
|
"completions/max_terminated_length": 712.0,
|
|
"completions/mean_length": 208.28515625,
|
|
"completions/mean_terminated_length": 209.92520141601562,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.12,
|
|
"grad_norm": 0.06162435933947563,
|
|
"learning_rate": 9.786476868327401e-07,
|
|
"loss": -0.0327,
|
|
"num_tokens": 37667916.0,
|
|
"reward": 1.14532470703125,
|
|
"reward_std": 0.30212295055389404,
|
|
"rewards/accuracy_reward_long_step": 0.30859375,
|
|
"rewards/final_brier_reward_long_step": 0.6789199113845825,
|
|
"rewards/format_reward_long_step": 0.96484375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7383161187171936,
|
|
"step": 75
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 585.0,
|
|
"completions/max_terminated_length": 585.0,
|
|
"completions/mean_length": 216.5234375,
|
|
"completions/mean_terminated_length": 216.5234375,
|
|
"completions/min_length": 86.0,
|
|
"completions/min_terminated_length": 86.0,
|
|
"epoch": 0.1216,
|
|
"grad_norm": 0.07474726438522339,
|
|
"learning_rate": 9.768683274021351e-07,
|
|
"loss": 0.0095,
|
|
"num_tokens": 38143594.0,
|
|
"reward": 1.2285196781158447,
|
|
"reward_std": 0.2515240013599396,
|
|
"rewards/accuracy_reward_long_step": 0.3671875,
|
|
"rewards/final_brier_reward_long_step": 0.715578556060791,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7375626564025879,
|
|
"step": 76
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 573.0,
|
|
"completions/max_terminated_length": 573.0,
|
|
"completions/mean_length": 209.671875,
|
|
"completions/mean_terminated_length": 209.671875,
|
|
"completions/min_length": 83.0,
|
|
"completions/min_terminated_length": 83.0,
|
|
"epoch": 0.1232,
|
|
"grad_norm": 0.04926946386694908,
|
|
"learning_rate": 9.750889679715302e-07,
|
|
"loss": 0.0161,
|
|
"num_tokens": 38622302.0,
|
|
"reward": 1.2208093404769897,
|
|
"reward_std": 0.23148852586746216,
|
|
"rewards/accuracy_reward_long_step": 0.34375,
|
|
"rewards/final_brier_reward_long_step": 0.7191964387893677,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8046656250953674,
|
|
"step": 77
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 500.0,
|
|
"completions/max_terminated_length": 500.0,
|
|
"completions/mean_length": 205.0,
|
|
"completions/mean_terminated_length": 205.0,
|
|
"completions/min_length": 62.0,
|
|
"completions/min_terminated_length": 62.0,
|
|
"epoch": 0.1248,
|
|
"grad_norm": 0.05082042142748833,
|
|
"learning_rate": 9.733096085409252e-07,
|
|
"loss": -0.0045,
|
|
"num_tokens": 39095686.0,
|
|
"reward": 1.1963913440704346,
|
|
"reward_std": 0.2708578109741211,
|
|
"rewards/accuracy_reward_long_step": 0.33203125,
|
|
"rewards/final_brier_reward_long_step": 0.7243698239326477,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7565078735351562,
|
|
"step": 78
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 692.0,
|
|
"completions/max_terminated_length": 692.0,
|
|
"completions/mean_length": 209.58203125,
|
|
"completions/mean_terminated_length": 209.58203125,
|
|
"completions/min_length": 97.0,
|
|
"completions/min_terminated_length": 97.0,
|
|
"epoch": 0.1264,
|
|
"grad_norm": 0.05406387895345688,
|
|
"learning_rate": 9.715302491103202e-07,
|
|
"loss": -0.0165,
|
|
"num_tokens": 39573555.0,
|
|
"reward": 1.2039846181869507,
|
|
"reward_std": 0.21231237053871155,
|
|
"rewards/accuracy_reward_long_step": 0.328125,
|
|
"rewards/final_brier_reward_long_step": 0.7417787313461304,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7850972414016724,
|
|
"step": 79
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 389.0,
|
|
"completions/max_terminated_length": 389.0,
|
|
"completions/mean_length": 200.73046875,
|
|
"completions/mean_terminated_length": 200.73046875,
|
|
"completions/min_length": 94.0,
|
|
"completions/min_terminated_length": 94.0,
|
|
"epoch": 0.128,
|
|
"grad_norm": 0.03713912516832352,
|
|
"learning_rate": 9.697508896797152e-07,
|
|
"loss": -0.0034,
|
|
"num_tokens": 40051286.0,
|
|
"reward": 1.1338202953338623,
|
|
"reward_std": 0.22740787267684937,
|
|
"rewards/accuracy_reward_long_step": 0.25390625,
|
|
"rewards/final_brier_reward_long_step": 0.7587928771972656,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7843012809753418,
|
|
"step": 80
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 581.0,
|
|
"completions/max_terminated_length": 581.0,
|
|
"completions/mean_length": 197.05859375,
|
|
"completions/mean_terminated_length": 197.05859375,
|
|
"completions/min_length": 90.0,
|
|
"completions/min_terminated_length": 90.0,
|
|
"epoch": 0.1296,
|
|
"grad_norm": 0.13255728781223297,
|
|
"learning_rate": 9.679715302491102e-07,
|
|
"loss": -0.0323,
|
|
"num_tokens": 40525805.0,
|
|
"reward": 1.2546930313110352,
|
|
"reward_std": 0.29959502816200256,
|
|
"rewards/accuracy_reward_long_step": 0.37890625,
|
|
"rewards/final_brier_reward_long_step": 0.7437311410903931,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7750412225723267,
|
|
"step": 81
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 397.0,
|
|
"completions/max_terminated_length": 397.0,
|
|
"completions/mean_length": 185.1015625,
|
|
"completions/mean_terminated_length": 185.1015625,
|
|
"completions/min_length": 91.0,
|
|
"completions/min_terminated_length": 91.0,
|
|
"epoch": 0.1312,
|
|
"grad_norm": 0.03457874432206154,
|
|
"learning_rate": 9.661921708185054e-07,
|
|
"loss": 0.0033,
|
|
"num_tokens": 41000735.0,
|
|
"reward": 1.2457207441329956,
|
|
"reward_std": 0.1934887319803238,
|
|
"rewards/accuracy_reward_long_step": 0.359375,
|
|
"rewards/final_brier_reward_long_step": 0.7806586027145386,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7647244334220886,
|
|
"step": 82
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 452.0,
|
|
"completions/max_terminated_length": 452.0,
|
|
"completions/mean_length": 193.9140625,
|
|
"completions/mean_terminated_length": 193.9140625,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.1328,
|
|
"grad_norm": 0.04606304317712784,
|
|
"learning_rate": 9.644128113879002e-07,
|
|
"loss": -0.0082,
|
|
"num_tokens": 41472457.0,
|
|
"reward": 1.2768468856811523,
|
|
"reward_std": 0.25950515270233154,
|
|
"rewards/accuracy_reward_long_step": 0.40625,
|
|
"rewards/final_brier_reward_long_step": 0.7273233532905579,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7785016894340515,
|
|
"step": 83
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 564.0,
|
|
"completions/max_terminated_length": 564.0,
|
|
"completions/mean_length": 187.13671875,
|
|
"completions/mean_terminated_length": 187.13671875,
|
|
"completions/min_length": 92.0,
|
|
"completions/min_terminated_length": 92.0,
|
|
"epoch": 0.1344,
|
|
"grad_norm": 0.04770605266094208,
|
|
"learning_rate": 9.626334519572953e-07,
|
|
"loss": 0.003,
|
|
"num_tokens": 41954644.0,
|
|
"reward": 1.3016421794891357,
|
|
"reward_std": 0.2636979818344116,
|
|
"rewards/accuracy_reward_long_step": 0.43359375,
|
|
"rewards/final_brier_reward_long_step": 0.7189725637435913,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7610331773757935,
|
|
"step": 84
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 380.0,
|
|
"completions/max_terminated_length": 380.0,
|
|
"completions/mean_length": 185.37109375,
|
|
"completions/mean_terminated_length": 186.09805297851562,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 100.0,
|
|
"epoch": 0.136,
|
|
"grad_norm": 0.038233935832977295,
|
|
"learning_rate": 9.608540925266903e-07,
|
|
"loss": -0.0246,
|
|
"num_tokens": 42421019.0,
|
|
"reward": 1.3664637804031372,
|
|
"reward_std": 0.2260427474975586,
|
|
"rewards/accuracy_reward_long_step": 0.5078125,
|
|
"rewards/final_brier_reward_long_step": 0.6927652359008789,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.765277624130249,
|
|
"step": 85
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 552.0,
|
|
"completions/max_terminated_length": 552.0,
|
|
"completions/mean_length": 192.4375,
|
|
"completions/mean_terminated_length": 192.4375,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.1376,
|
|
"grad_norm": 0.031944431364536285,
|
|
"learning_rate": 9.590747330960853e-07,
|
|
"loss": -0.0075,
|
|
"num_tokens": 42885091.0,
|
|
"reward": 1.2885265350341797,
|
|
"reward_std": 0.21891814470291138,
|
|
"rewards/accuracy_reward_long_step": 0.41796875,
|
|
"rewards/final_brier_reward_long_step": 0.7211390733718872,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7689046859741211,
|
|
"step": 86
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 505.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 192.12890625,
|
|
"completions/mean_terminated_length": 192.12890625,
|
|
"completions/min_length": 81.0,
|
|
"completions/min_terminated_length": 81.0,
|
|
"epoch": 0.1392,
|
|
"grad_norm": 0.053836286067962646,
|
|
"learning_rate": 9.572953736654805e-07,
|
|
"loss": 0.0032,
|
|
"num_tokens": 43357156.0,
|
|
"reward": 1.3813579082489014,
|
|
"reward_std": 0.25391727685928345,
|
|
"rewards/accuracy_reward_long_step": 0.546875,
|
|
"rewards/final_brier_reward_long_step": 0.6488757729530334,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7124930620193481,
|
|
"step": 87
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 579.0,
|
|
"completions/max_terminated_length": 579.0,
|
|
"completions/mean_length": 188.6640625,
|
|
"completions/mean_terminated_length": 188.6640625,
|
|
"completions/min_length": 87.0,
|
|
"completions/min_terminated_length": 87.0,
|
|
"epoch": 0.1408,
|
|
"grad_norm": 0.04048394784331322,
|
|
"learning_rate": 9.555160142348753e-07,
|
|
"loss": -0.0004,
|
|
"num_tokens": 43821726.0,
|
|
"reward": 1.2759735584259033,
|
|
"reward_std": 0.23172587156295776,
|
|
"rewards/accuracy_reward_long_step": 0.40234375,
|
|
"rewards/final_brier_reward_long_step": 0.7274124622344971,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7905445694923401,
|
|
"step": 88
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 534.0,
|
|
"completions/max_terminated_length": 534.0,
|
|
"completions/mean_length": 190.31640625,
|
|
"completions/mean_terminated_length": 190.31640625,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.1424,
|
|
"grad_norm": 0.035542842000722885,
|
|
"learning_rate": 9.537366548042705e-07,
|
|
"loss": 0.0089,
|
|
"num_tokens": 44299575.0,
|
|
"reward": 1.2254152297973633,
|
|
"reward_std": 0.2625795304775238,
|
|
"rewards/accuracy_reward_long_step": 0.359375,
|
|
"rewards/final_brier_reward_long_step": 0.7171218395233154,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7626639604568481,
|
|
"step": 89
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 439.0,
|
|
"completions/max_terminated_length": 439.0,
|
|
"completions/mean_length": 186.6171875,
|
|
"completions/mean_terminated_length": 186.6171875,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.144,
|
|
"grad_norm": 0.04348074272274971,
|
|
"learning_rate": 9.519572953736655e-07,
|
|
"loss": -0.0097,
|
|
"num_tokens": 44766493.0,
|
|
"reward": 1.2421379089355469,
|
|
"reward_std": 0.2381449192762375,
|
|
"rewards/accuracy_reward_long_step": 0.37109375,
|
|
"rewards/final_brier_reward_long_step": 0.7263898849487305,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7577868700027466,
|
|
"step": 90
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 527.0,
|
|
"completions/max_terminated_length": 527.0,
|
|
"completions/mean_length": 193.671875,
|
|
"completions/mean_terminated_length": 194.43138122558594,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.1456,
|
|
"grad_norm": 0.10439097136259079,
|
|
"learning_rate": 9.501779359430605e-07,
|
|
"loss": 0.014,
|
|
"num_tokens": 45226529.0,
|
|
"reward": 1.2801823616027832,
|
|
"reward_std": 0.22653043270111084,
|
|
"rewards/accuracy_reward_long_step": 0.42578125,
|
|
"rewards/final_brier_reward_long_step": 0.6875629425048828,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7534794807434082,
|
|
"step": 91
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.00390625,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.00390625,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 472.0,
|
|
"completions/max_terminated_length": 472.0,
|
|
"completions/mean_length": 192.19921875,
|
|
"completions/mean_terminated_length": 192.19921875,
|
|
"completions/min_length": 68.0,
|
|
"completions/min_terminated_length": 68.0,
|
|
"epoch": 0.1472,
|
|
"grad_norm": 0.038028016686439514,
|
|
"learning_rate": 9.483985765124555e-07,
|
|
"loss": 0.0038,
|
|
"num_tokens": 45698404.0,
|
|
"reward": 1.2324891090393066,
|
|
"reward_std": 0.2521362900733948,
|
|
"rewards/accuracy_reward_long_step": 0.37890625,
|
|
"rewards/final_brier_reward_long_step": 0.7127734422683716,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7171825766563416,
|
|
"step": 92
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 394.0,
|
|
"completions/max_terminated_length": 394.0,
|
|
"completions/mean_length": 186.05859375,
|
|
"completions/mean_terminated_length": 186.05859375,
|
|
"completions/min_length": 95.0,
|
|
"completions/min_terminated_length": 95.0,
|
|
"epoch": 0.1488,
|
|
"grad_norm": 0.0344776026904583,
|
|
"learning_rate": 9.466192170818504e-07,
|
|
"loss": 0.0051,
|
|
"num_tokens": 46169131.0,
|
|
"reward": 1.2282606363296509,
|
|
"reward_std": 0.16655448079109192,
|
|
"rewards/accuracy_reward_long_step": 0.359375,
|
|
"rewards/final_brier_reward_long_step": 0.7309889793395996,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7523662447929382,
|
|
"step": 93
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 434.0,
|
|
"completions/max_terminated_length": 434.0,
|
|
"completions/mean_length": 190.9765625,
|
|
"completions/mean_terminated_length": 190.9765625,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.1504,
|
|
"grad_norm": 0.03720776364207268,
|
|
"learning_rate": 9.448398576512455e-07,
|
|
"loss": 0.0102,
|
|
"num_tokens": 46636349.0,
|
|
"reward": 1.2539737224578857,
|
|
"reward_std": 0.2229781448841095,
|
|
"rewards/accuracy_reward_long_step": 0.37890625,
|
|
"rewards/final_brier_reward_long_step": 0.7260522246360779,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7742174863815308,
|
|
"step": 94
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 563.0,
|
|
"completions/max_terminated_length": 563.0,
|
|
"completions/mean_length": 191.19921875,
|
|
"completions/mean_terminated_length": 191.94903564453125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 94.0,
|
|
"epoch": 0.152,
|
|
"grad_norm": 0.03717532381415367,
|
|
"learning_rate": 9.430604982206405e-07,
|
|
"loss": -0.0118,
|
|
"num_tokens": 47117128.0,
|
|
"reward": 1.2085305452346802,
|
|
"reward_std": 0.22574907541275024,
|
|
"rewards/accuracy_reward_long_step": 0.3359375,
|
|
"rewards/final_brier_reward_long_step": 0.7332504391670227,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7805593013763428,
|
|
"step": 95
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 707.0,
|
|
"completions/max_terminated_length": 707.0,
|
|
"completions/mean_length": 193.6953125,
|
|
"completions/mean_terminated_length": 193.6953125,
|
|
"completions/min_length": 78.0,
|
|
"completions/min_terminated_length": 78.0,
|
|
"epoch": 0.1536,
|
|
"grad_norm": 0.03478072211146355,
|
|
"learning_rate": 9.412811387900355e-07,
|
|
"loss": -0.0084,
|
|
"num_tokens": 47604330.0,
|
|
"reward": 1.3053498268127441,
|
|
"reward_std": 0.20083755254745483,
|
|
"rewards/accuracy_reward_long_step": 0.453125,
|
|
"rewards/final_brier_reward_long_step": 0.6733219027519226,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7355772256851196,
|
|
"step": 96
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 397.0,
|
|
"completions/max_terminated_length": 397.0,
|
|
"completions/mean_length": 196.6015625,
|
|
"completions/mean_terminated_length": 196.6015625,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.1552,
|
|
"grad_norm": 0.040781840682029724,
|
|
"learning_rate": 9.395017793594306e-07,
|
|
"loss": -0.0066,
|
|
"num_tokens": 48071700.0,
|
|
"reward": 1.2769510746002197,
|
|
"reward_std": 0.18185681104660034,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7033559679985046,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7560105919837952,
|
|
"step": 97
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 510.0,
|
|
"completions/max_terminated_length": 510.0,
|
|
"completions/mean_length": 184.20703125,
|
|
"completions/mean_terminated_length": 184.20703125,
|
|
"completions/min_length": 97.0,
|
|
"completions/min_terminated_length": 97.0,
|
|
"epoch": 0.1568,
|
|
"grad_norm": 0.04920961335301399,
|
|
"learning_rate": 9.377224199288256e-07,
|
|
"loss": -0.0063,
|
|
"num_tokens": 48530185.0,
|
|
"reward": 1.2058653831481934,
|
|
"reward_std": 0.1759049892425537,
|
|
"rewards/accuracy_reward_long_step": 0.32421875,
|
|
"rewards/final_brier_reward_long_step": 0.7487024068832397,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7856966853141785,
|
|
"step": 98
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 381.0,
|
|
"completions/max_terminated_length": 381.0,
|
|
"completions/mean_length": 186.4609375,
|
|
"completions/mean_terminated_length": 186.4609375,
|
|
"completions/min_length": 76.0,
|
|
"completions/min_terminated_length": 76.0,
|
|
"epoch": 0.1584,
|
|
"grad_norm": 0.03607820346951485,
|
|
"learning_rate": 9.359430604982206e-07,
|
|
"loss": 0.002,
|
|
"num_tokens": 49009167.0,
|
|
"reward": 1.2021329402923584,
|
|
"reward_std": 0.17198419570922852,
|
|
"rewards/accuracy_reward_long_step": 0.30859375,
|
|
"rewards/final_brier_reward_long_step": 0.7756035327911377,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7985529899597168,
|
|
"step": 99
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 404.0,
|
|
"completions/max_terminated_length": 404.0,
|
|
"completions/mean_length": 193.265625,
|
|
"completions/mean_terminated_length": 193.265625,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.16,
|
|
"grad_norm": 0.03377021104097366,
|
|
"learning_rate": 9.341637010676157e-07,
|
|
"loss": 0.0073,
|
|
"num_tokens": 49501779.0,
|
|
"reward": 1.2661197185516357,
|
|
"reward_std": 0.1904180347919464,
|
|
"rewards/accuracy_reward_long_step": 0.40625,
|
|
"rewards/final_brier_reward_long_step": 0.6986390352249146,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7408397197723389,
|
|
"step": 100
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 703.0,
|
|
"completions/max_terminated_length": 703.0,
|
|
"completions/mean_length": 191.34765625,
|
|
"completions/mean_terminated_length": 192.09805297851562,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 92.0,
|
|
"epoch": 0.1616,
|
|
"grad_norm": 0.04310128837823868,
|
|
"learning_rate": 9.323843416370106e-07,
|
|
"loss": -0.0187,
|
|
"num_tokens": 49985236.0,
|
|
"reward": 1.2589505910873413,
|
|
"reward_std": 0.15486913919448853,
|
|
"rewards/accuracy_reward_long_step": 0.38671875,
|
|
"rewards/final_brier_reward_long_step": 0.7101609110832214,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7943914532661438,
|
|
"step": 101
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 716.0,
|
|
"completions/max_terminated_length": 716.0,
|
|
"completions/mean_length": 197.26953125,
|
|
"completions/mean_terminated_length": 197.26953125,
|
|
"completions/min_length": 100.0,
|
|
"completions/min_terminated_length": 100.0,
|
|
"epoch": 0.1632,
|
|
"grad_norm": 0.06449352204799652,
|
|
"learning_rate": 9.306049822064056e-07,
|
|
"loss": -0.0023,
|
|
"num_tokens": 50458833.0,
|
|
"reward": 1.2103374004364014,
|
|
"reward_std": 0.17173272371292114,
|
|
"rewards/accuracy_reward_long_step": 0.33203125,
|
|
"rewards/final_brier_reward_long_step": 0.7552437782287598,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7736056447029114,
|
|
"step": 102
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 449.0,
|
|
"completions/max_terminated_length": 449.0,
|
|
"completions/mean_length": 196.171875,
|
|
"completions/mean_terminated_length": 196.171875,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.1648,
|
|
"grad_norm": 0.03428987041115761,
|
|
"learning_rate": 9.288256227758006e-07,
|
|
"loss": -0.0075,
|
|
"num_tokens": 50948197.0,
|
|
"reward": 1.2849457263946533,
|
|
"reward_std": 0.22478044033050537,
|
|
"rewards/accuracy_reward_long_step": 0.43359375,
|
|
"rewards/final_brier_reward_long_step": 0.6716355085372925,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7415850758552551,
|
|
"step": 103
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 388.0,
|
|
"completions/max_terminated_length": 388.0,
|
|
"completions/mean_length": 189.1171875,
|
|
"completions/mean_terminated_length": 189.1171875,
|
|
"completions/min_length": 100.0,
|
|
"completions/min_terminated_length": 100.0,
|
|
"epoch": 0.1664,
|
|
"grad_norm": 0.03708449751138687,
|
|
"learning_rate": 9.270462633451957e-07,
|
|
"loss": 0.0166,
|
|
"num_tokens": 51417579.0,
|
|
"reward": 1.3035290241241455,
|
|
"reward_std": 0.2233991026878357,
|
|
"rewards/accuracy_reward_long_step": 0.4453125,
|
|
"rewards/final_brier_reward_long_step": 0.6784660220146179,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7544001936912537,
|
|
"step": 104
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 467.0,
|
|
"completions/max_terminated_length": 467.0,
|
|
"completions/mean_length": 199.61328125,
|
|
"completions/mean_terminated_length": 199.61328125,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.168,
|
|
"grad_norm": 0.04729332774877548,
|
|
"learning_rate": 9.252669039145908e-07,
|
|
"loss": -0.0052,
|
|
"num_tokens": 51903344.0,
|
|
"reward": 1.32490873336792,
|
|
"reward_std": 0.23365336656570435,
|
|
"rewards/accuracy_reward_long_step": 0.47265625,
|
|
"rewards/final_brier_reward_long_step": 0.6609004139900208,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7559216618537903,
|
|
"step": 105
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 434.0,
|
|
"completions/max_terminated_length": 434.0,
|
|
"completions/mean_length": 199.90625,
|
|
"completions/mean_terminated_length": 199.90625,
|
|
"completions/min_length": 98.0,
|
|
"completions/min_terminated_length": 98.0,
|
|
"epoch": 0.1696,
|
|
"grad_norm": 0.03533056378364563,
|
|
"learning_rate": 9.234875444839857e-07,
|
|
"loss": 0.0067,
|
|
"num_tokens": 52395024.0,
|
|
"reward": 1.1930384635925293,
|
|
"reward_std": 0.1815432459115982,
|
|
"rewards/accuracy_reward_long_step": 0.3203125,
|
|
"rewards/final_brier_reward_long_step": 0.7636566162109375,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7272477149963379,
|
|
"step": 106
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 354.0,
|
|
"completions/max_terminated_length": 354.0,
|
|
"completions/mean_length": 190.7265625,
|
|
"completions/mean_terminated_length": 190.7265625,
|
|
"completions/min_length": 106.0,
|
|
"completions/min_terminated_length": 106.0,
|
|
"epoch": 0.1712,
|
|
"grad_norm": 0.03335587680339813,
|
|
"learning_rate": 9.217081850533808e-07,
|
|
"loss": 0.0079,
|
|
"num_tokens": 52879418.0,
|
|
"reward": 1.2246638536453247,
|
|
"reward_std": 0.22744205594062805,
|
|
"rewards/accuracy_reward_long_step": 0.359375,
|
|
"rewards/final_brier_reward_long_step": 0.7271843552589417,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7495959997177124,
|
|
"step": 107
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 590.0,
|
|
"completions/max_terminated_length": 590.0,
|
|
"completions/mean_length": 195.2109375,
|
|
"completions/mean_terminated_length": 195.2109375,
|
|
"completions/min_length": 89.0,
|
|
"completions/min_terminated_length": 89.0,
|
|
"epoch": 0.1728,
|
|
"grad_norm": 0.03303585201501846,
|
|
"learning_rate": 9.199288256227757e-07,
|
|
"loss": 0.0029,
|
|
"num_tokens": 53356184.0,
|
|
"reward": 1.2511444091796875,
|
|
"reward_std": 0.16916480660438538,
|
|
"rewards/accuracy_reward_long_step": 0.37890625,
|
|
"rewards/final_brier_reward_long_step": 0.7238613367080688,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7650911808013916,
|
|
"step": 108
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 534.0,
|
|
"completions/max_terminated_length": 534.0,
|
|
"completions/mean_length": 194.62890625,
|
|
"completions/mean_terminated_length": 194.62890625,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.1744,
|
|
"grad_norm": 0.03541121259331703,
|
|
"learning_rate": 9.181494661921708e-07,
|
|
"loss": -0.0096,
|
|
"num_tokens": 53823321.0,
|
|
"reward": 1.2710667848587036,
|
|
"reward_std": 0.17301318049430847,
|
|
"rewards/accuracy_reward_long_step": 0.41015625,
|
|
"rewards/final_brier_reward_long_step": 0.6999242305755615,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7437179684638977,
|
|
"step": 109
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 431.0,
|
|
"completions/max_terminated_length": 431.0,
|
|
"completions/mean_length": 197.36328125,
|
|
"completions/mean_terminated_length": 197.36328125,
|
|
"completions/min_length": 98.0,
|
|
"completions/min_terminated_length": 98.0,
|
|
"epoch": 0.176,
|
|
"grad_norm": 0.032131701707839966,
|
|
"learning_rate": 9.163701067615657e-07,
|
|
"loss": 0.0068,
|
|
"num_tokens": 54302126.0,
|
|
"reward": 1.3117148876190186,
|
|
"reward_std": 0.232276052236557,
|
|
"rewards/accuracy_reward_long_step": 0.45703125,
|
|
"rewards/final_brier_reward_long_step": 0.6579011678695679,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7686457633972168,
|
|
"step": 110
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 508.0,
|
|
"completions/max_terminated_length": 508.0,
|
|
"completions/mean_length": 199.5078125,
|
|
"completions/mean_terminated_length": 199.5078125,
|
|
"completions/min_length": 97.0,
|
|
"completions/min_terminated_length": 97.0,
|
|
"epoch": 0.1776,
|
|
"grad_norm": 0.03558618575334549,
|
|
"learning_rate": 9.145907473309609e-07,
|
|
"loss": -0.0095,
|
|
"num_tokens": 54778832.0,
|
|
"reward": 1.2715282440185547,
|
|
"reward_std": 0.17849522829055786,
|
|
"rewards/accuracy_reward_long_step": 0.40234375,
|
|
"rewards/final_brier_reward_long_step": 0.7140308618545532,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7627072930335999,
|
|
"step": 111
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 399.0,
|
|
"completions/max_terminated_length": 399.0,
|
|
"completions/mean_length": 193.99609375,
|
|
"completions/mean_terminated_length": 193.99609375,
|
|
"completions/min_length": 86.0,
|
|
"completions/min_terminated_length": 86.0,
|
|
"epoch": 0.1792,
|
|
"grad_norm": 0.048463039100170135,
|
|
"learning_rate": 9.128113879003559e-07,
|
|
"loss": -0.0061,
|
|
"num_tokens": 55260847.0,
|
|
"reward": 1.2326127290725708,
|
|
"reward_std": 0.10357346385717392,
|
|
"rewards/accuracy_reward_long_step": 0.35546875,
|
|
"rewards/final_brier_reward_long_step": 0.7353038787841797,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.773271918296814,
|
|
"step": 112
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 521.0,
|
|
"completions/max_terminated_length": 521.0,
|
|
"completions/mean_length": 190.48828125,
|
|
"completions/mean_terminated_length": 190.48828125,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.1808,
|
|
"grad_norm": 0.036182910203933716,
|
|
"learning_rate": 9.110320284697508e-07,
|
|
"loss": -0.0012,
|
|
"num_tokens": 55716884.0,
|
|
"reward": 1.3314650058746338,
|
|
"reward_std": 0.21325276792049408,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.653796911239624,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.750187873840332,
|
|
"step": 113
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 422.0,
|
|
"completions/max_terminated_length": 422.0,
|
|
"completions/mean_length": 197.38671875,
|
|
"completions/mean_terminated_length": 197.38671875,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.1824,
|
|
"grad_norm": 0.0341506227850914,
|
|
"learning_rate": 9.092526690391459e-07,
|
|
"loss": -0.0149,
|
|
"num_tokens": 56184679.0,
|
|
"reward": 1.3492083549499512,
|
|
"reward_std": 0.1791898012161255,
|
|
"rewards/accuracy_reward_long_step": 0.4921875,
|
|
"rewards/final_brier_reward_long_step": 0.6604753732681274,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7676081657409668,
|
|
"step": 114
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 349.0,
|
|
"completions/max_terminated_length": 349.0,
|
|
"completions/mean_length": 203.13671875,
|
|
"completions/mean_terminated_length": 203.13671875,
|
|
"completions/min_length": 95.0,
|
|
"completions/min_terminated_length": 95.0,
|
|
"epoch": 0.184,
|
|
"grad_norm": 0.03318583592772484,
|
|
"learning_rate": 9.074733096085408e-07,
|
|
"loss": 0.012,
|
|
"num_tokens": 56663218.0,
|
|
"reward": 1.3214986324310303,
|
|
"reward_std": 0.15533313155174255,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.6525156497955322,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7350417375564575,
|
|
"step": 115
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 450.0,
|
|
"completions/max_terminated_length": 450.0,
|
|
"completions/mean_length": 196.93359375,
|
|
"completions/mean_terminated_length": 196.93359375,
|
|
"completions/min_length": 9.0,
|
|
"completions/min_terminated_length": 9.0,
|
|
"epoch": 0.1856,
|
|
"grad_norm": 0.04418765380978584,
|
|
"learning_rate": 9.056939501779359e-07,
|
|
"loss": -0.0076,
|
|
"num_tokens": 57139865.0,
|
|
"reward": 1.3460896015167236,
|
|
"reward_std": 0.1907288283109665,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.668144941329956,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7709013223648071,
|
|
"step": 116
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 400.0,
|
|
"completions/max_terminated_length": 400.0,
|
|
"completions/mean_length": 199.77734375,
|
|
"completions/mean_terminated_length": 199.77734375,
|
|
"completions/min_length": 84.0,
|
|
"completions/min_terminated_length": 84.0,
|
|
"epoch": 0.1872,
|
|
"grad_norm": 0.031136956065893173,
|
|
"learning_rate": 9.03914590747331e-07,
|
|
"loss": -0.0184,
|
|
"num_tokens": 57605152.0,
|
|
"reward": 1.36716890335083,
|
|
"reward_std": 0.19431088864803314,
|
|
"rewards/accuracy_reward_long_step": 0.5078125,
|
|
"rewards/final_brier_reward_long_step": 0.6628949642181396,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7745306491851807,
|
|
"step": 117
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 614.0,
|
|
"completions/max_terminated_length": 614.0,
|
|
"completions/mean_length": 202.734375,
|
|
"completions/mean_terminated_length": 202.734375,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.1888,
|
|
"grad_norm": 0.03184520825743675,
|
|
"learning_rate": 9.02135231316726e-07,
|
|
"loss": -0.0108,
|
|
"num_tokens": 58073260.0,
|
|
"reward": 1.2726809978485107,
|
|
"reward_std": 0.20508110523223877,
|
|
"rewards/accuracy_reward_long_step": 0.3984375,
|
|
"rewards/final_brier_reward_long_step": 0.7245535254478455,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7802332639694214,
|
|
"step": 118
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 504.0,
|
|
"completions/max_terminated_length": 504.0,
|
|
"completions/mean_length": 205.05859375,
|
|
"completions/mean_terminated_length": 205.05859375,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.1904,
|
|
"grad_norm": 0.033281709998846054,
|
|
"learning_rate": 9.00355871886121e-07,
|
|
"loss": 0.0056,
|
|
"num_tokens": 58547163.0,
|
|
"reward": 1.2008931636810303,
|
|
"reward_std": 0.12888304889202118,
|
|
"rewards/accuracy_reward_long_step": 0.30078125,
|
|
"rewards/final_brier_reward_long_step": 0.7928597927093506,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8075879216194153,
|
|
"step": 119
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 471.0,
|
|
"completions/max_terminated_length": 471.0,
|
|
"completions/mean_length": 207.6875,
|
|
"completions/mean_terminated_length": 207.6875,
|
|
"completions/min_length": 116.0,
|
|
"completions/min_terminated_length": 116.0,
|
|
"epoch": 0.192,
|
|
"grad_norm": 0.03089357167482376,
|
|
"learning_rate": 8.98576512455516e-07,
|
|
"loss": -0.0066,
|
|
"num_tokens": 59008507.0,
|
|
"reward": 1.337794303894043,
|
|
"reward_std": 0.14488165080547333,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.6958640813827515,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.78031325340271,
|
|
"step": 120
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 402.0,
|
|
"completions/max_terminated_length": 402.0,
|
|
"completions/mean_length": 210.828125,
|
|
"completions/mean_terminated_length": 210.828125,
|
|
"completions/min_length": 96.0,
|
|
"completions/min_terminated_length": 96.0,
|
|
"epoch": 0.1936,
|
|
"grad_norm": 0.03139025717973709,
|
|
"learning_rate": 8.96797153024911e-07,
|
|
"loss": 0.0168,
|
|
"num_tokens": 59465479.0,
|
|
"reward": 1.46078622341156,
|
|
"reward_std": 0.2090751975774765,
|
|
"rewards/accuracy_reward_long_step": 0.6171875,
|
|
"rewards/final_brier_reward_long_step": 0.6339257955551147,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7404694557189941,
|
|
"step": 121
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 448.0,
|
|
"completions/max_terminated_length": 448.0,
|
|
"completions/mean_length": 225.23828125,
|
|
"completions/mean_terminated_length": 225.23828125,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.1952,
|
|
"grad_norm": 0.03497195616364479,
|
|
"learning_rate": 8.950177935943059e-07,
|
|
"loss": 0.0103,
|
|
"num_tokens": 59950300.0,
|
|
"reward": 1.329277515411377,
|
|
"reward_std": 0.17536477744579315,
|
|
"rewards/accuracy_reward_long_step": 0.453125,
|
|
"rewards/final_brier_reward_long_step": 0.7139711380004883,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7906389832496643,
|
|
"step": 122
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 491.0,
|
|
"completions/max_terminated_length": 491.0,
|
|
"completions/mean_length": 213.8671875,
|
|
"completions/mean_terminated_length": 213.8671875,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.1968,
|
|
"grad_norm": 0.03257234767079353,
|
|
"learning_rate": 8.93238434163701e-07,
|
|
"loss": -0.0167,
|
|
"num_tokens": 60433226.0,
|
|
"reward": 1.4588714838027954,
|
|
"reward_std": 0.15642720460891724,
|
|
"rewards/accuracy_reward_long_step": 0.59375,
|
|
"rewards/final_brier_reward_long_step": 0.6580198407173157,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8024659752845764,
|
|
"step": 123
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 456.0,
|
|
"completions/max_terminated_length": 456.0,
|
|
"completions/mean_length": 217.0,
|
|
"completions/mean_terminated_length": 217.0,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.1984,
|
|
"grad_norm": 0.03485409542918205,
|
|
"learning_rate": 8.91459074733096e-07,
|
|
"loss": 0.0054,
|
|
"num_tokens": 60912170.0,
|
|
"reward": 1.2661014795303345,
|
|
"reward_std": 0.16737821698188782,
|
|
"rewards/accuracy_reward_long_step": 0.3671875,
|
|
"rewards/final_brier_reward_long_step": 0.7646335959434509,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8310222625732422,
|
|
"step": 124
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 456.0,
|
|
"completions/max_terminated_length": 456.0,
|
|
"completions/mean_length": 222.78515625,
|
|
"completions/mean_terminated_length": 222.78515625,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.2,
|
|
"grad_norm": 0.030056415125727654,
|
|
"learning_rate": 8.896797153024911e-07,
|
|
"loss": 0.0011,
|
|
"num_tokens": 61399619.0,
|
|
"reward": 1.3533146381378174,
|
|
"reward_std": 0.18137666583061218,
|
|
"rewards/accuracy_reward_long_step": 0.47265625,
|
|
"rewards/final_brier_reward_long_step": 0.7190214991569519,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8036117553710938,
|
|
"step": 125
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 397.0,
|
|
"completions/max_terminated_length": 397.0,
|
|
"completions/mean_length": 212.23046875,
|
|
"completions/mean_terminated_length": 212.23046875,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.2016,
|
|
"grad_norm": 0.04059338942170143,
|
|
"learning_rate": 8.879003558718861e-07,
|
|
"loss": 0.0105,
|
|
"num_tokens": 61886622.0,
|
|
"reward": 1.386685848236084,
|
|
"reward_std": 0.2339630126953125,
|
|
"rewards/accuracy_reward_long_step": 0.50390625,
|
|
"rewards/final_brier_reward_long_step": 0.7192109823226929,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8119070529937744,
|
|
"step": 126
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 399.0,
|
|
"completions/max_terminated_length": 399.0,
|
|
"completions/mean_length": 212.375,
|
|
"completions/mean_terminated_length": 212.375,
|
|
"completions/min_length": 113.0,
|
|
"completions/min_terminated_length": 113.0,
|
|
"epoch": 0.2032,
|
|
"grad_norm": 0.16263210773468018,
|
|
"learning_rate": 8.861209964412811e-07,
|
|
"loss": 0.0043,
|
|
"num_tokens": 62374030.0,
|
|
"reward": 1.2957684993743896,
|
|
"reward_std": 0.15368527173995972,
|
|
"rewards/accuracy_reward_long_step": 0.40625,
|
|
"rewards/final_brier_reward_long_step": 0.7560636401176453,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8098228573799133,
|
|
"step": 127
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 431.0,
|
|
"completions/max_terminated_length": 431.0,
|
|
"completions/mean_length": 215.7109375,
|
|
"completions/mean_terminated_length": 215.7109375,
|
|
"completions/min_length": 118.0,
|
|
"completions/min_terminated_length": 118.0,
|
|
"epoch": 0.2048,
|
|
"grad_norm": 0.039368387311697006,
|
|
"learning_rate": 8.843416370106761e-07,
|
|
"loss": 0.0028,
|
|
"num_tokens": 62861236.0,
|
|
"reward": 1.355287790298462,
|
|
"reward_std": 0.21285982429981232,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.7233257293701172,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7915753126144409,
|
|
"step": 128
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 380.0,
|
|
"completions/max_terminated_length": 380.0,
|
|
"completions/mean_length": 208.1328125,
|
|
"completions/mean_terminated_length": 208.1328125,
|
|
"completions/min_length": 83.0,
|
|
"completions/min_terminated_length": 83.0,
|
|
"epoch": 0.2064,
|
|
"grad_norm": 0.03151082620024681,
|
|
"learning_rate": 8.825622775800712e-07,
|
|
"loss": -0.004,
|
|
"num_tokens": 63345814.0,
|
|
"reward": 1.3812355995178223,
|
|
"reward_std": 0.2573654055595398,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.7093117237091064,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8234432339668274,
|
|
"step": 129
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 514.0,
|
|
"completions/max_terminated_length": 514.0,
|
|
"completions/mean_length": 216.51171875,
|
|
"completions/mean_terminated_length": 216.51171875,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.208,
|
|
"grad_norm": 0.03233165293931961,
|
|
"learning_rate": 8.807829181494661e-07,
|
|
"loss": 0.0018,
|
|
"num_tokens": 63809681.0,
|
|
"reward": 1.3362829685211182,
|
|
"reward_std": 0.12012840807437897,
|
|
"rewards/accuracy_reward_long_step": 0.4453125,
|
|
"rewards/final_brier_reward_long_step": 0.7555733919143677,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8083083033561707,
|
|
"step": 130
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 494.0,
|
|
"completions/max_terminated_length": 494.0,
|
|
"completions/mean_length": 213.39453125,
|
|
"completions/mean_terminated_length": 213.39453125,
|
|
"completions/min_length": 120.0,
|
|
"completions/min_terminated_length": 120.0,
|
|
"epoch": 0.2096,
|
|
"grad_norm": 0.028420858085155487,
|
|
"learning_rate": 8.790035587188612e-07,
|
|
"loss": 0.0054,
|
|
"num_tokens": 64295150.0,
|
|
"reward": 1.3931207656860352,
|
|
"reward_std": 0.1800289750099182,
|
|
"rewards/accuracy_reward_long_step": 0.51953125,
|
|
"rewards/final_brier_reward_long_step": 0.7075101733207703,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7868478298187256,
|
|
"step": 131
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 471.0,
|
|
"completions/max_terminated_length": 471.0,
|
|
"completions/mean_length": 220.79296875,
|
|
"completions/mean_terminated_length": 220.79296875,
|
|
"completions/min_length": 133.0,
|
|
"completions/min_terminated_length": 133.0,
|
|
"epoch": 0.2112,
|
|
"grad_norm": 0.030808866024017334,
|
|
"learning_rate": 8.772241992882562e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 64780465.0,
|
|
"reward": 1.2357356548309326,
|
|
"reward_std": 0.1879829466342926,
|
|
"rewards/accuracy_reward_long_step": 0.33984375,
|
|
"rewards/final_brier_reward_long_step": 0.7859160304069519,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7976517081260681,
|
|
"step": 132
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 527.0,
|
|
"completions/max_terminated_length": 527.0,
|
|
"completions/mean_length": 210.00390625,
|
|
"completions/mean_terminated_length": 210.00390625,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.2128,
|
|
"grad_norm": 0.030662264674901962,
|
|
"learning_rate": 8.754448398576512e-07,
|
|
"loss": -0.0054,
|
|
"num_tokens": 65255178.0,
|
|
"reward": 1.261284351348877,
|
|
"reward_std": 0.19841524958610535,
|
|
"rewards/accuracy_reward_long_step": 0.37890625,
|
|
"rewards/final_brier_reward_long_step": 0.7597503662109375,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7775742411613464,
|
|
"step": 133
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 371.0,
|
|
"completions/max_terminated_length": 371.0,
|
|
"completions/mean_length": 202.95703125,
|
|
"completions/mean_terminated_length": 202.95703125,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.2144,
|
|
"grad_norm": 0.03131137415766716,
|
|
"learning_rate": 8.736654804270462e-07,
|
|
"loss": 0.0037,
|
|
"num_tokens": 65725903.0,
|
|
"reward": 1.3632652759552002,
|
|
"reward_std": 0.15484619140625,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.7492175698280334,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7975935935974121,
|
|
"step": 134
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 501.0,
|
|
"completions/max_terminated_length": 501.0,
|
|
"completions/mean_length": 216.09375,
|
|
"completions/mean_terminated_length": 216.09375,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.216,
|
|
"grad_norm": 0.03075851872563362,
|
|
"learning_rate": 8.718861209964412e-07,
|
|
"loss": 0.0039,
|
|
"num_tokens": 66198535.0,
|
|
"reward": 1.4220490455627441,
|
|
"reward_std": 0.13649022579193115,
|
|
"rewards/accuracy_reward_long_step": 0.546875,
|
|
"rewards/final_brier_reward_long_step": 0.7138031125068665,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7868932485580444,
|
|
"step": 135
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 438.0,
|
|
"completions/max_terminated_length": 438.0,
|
|
"completions/mean_length": 216.6953125,
|
|
"completions/mean_terminated_length": 216.6953125,
|
|
"completions/min_length": 107.0,
|
|
"completions/min_terminated_length": 107.0,
|
|
"epoch": 0.2176,
|
|
"grad_norm": 0.03281315043568611,
|
|
"learning_rate": 8.701067615658363e-07,
|
|
"loss": 0.0138,
|
|
"num_tokens": 66696009.0,
|
|
"reward": 1.294492483139038,
|
|
"reward_std": 0.23467326164245605,
|
|
"rewards/accuracy_reward_long_step": 0.40234375,
|
|
"rewards/final_brier_reward_long_step": 0.7704480886459351,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.798146665096283,
|
|
"step": 136
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 420.0,
|
|
"completions/max_terminated_length": 420.0,
|
|
"completions/mean_length": 204.03515625,
|
|
"completions/mean_terminated_length": 204.03515625,
|
|
"completions/min_length": 94.0,
|
|
"completions/min_terminated_length": 94.0,
|
|
"epoch": 0.2192,
|
|
"grad_norm": 0.03420788049697876,
|
|
"learning_rate": 8.683274021352312e-07,
|
|
"loss": -0.0052,
|
|
"num_tokens": 67178186.0,
|
|
"reward": 1.439988613128662,
|
|
"reward_std": 0.19740483164787292,
|
|
"rewards/accuracy_reward_long_step": 0.55859375,
|
|
"rewards/final_brier_reward_long_step": 0.7100058794021606,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8155736923217773,
|
|
"step": 137
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 572.0,
|
|
"completions/max_terminated_length": 572.0,
|
|
"completions/mean_length": 208.546875,
|
|
"completions/mean_terminated_length": 208.546875,
|
|
"completions/min_length": 98.0,
|
|
"completions/min_terminated_length": 98.0,
|
|
"epoch": 0.2208,
|
|
"grad_norm": 0.030317138880491257,
|
|
"learning_rate": 8.665480427046264e-07,
|
|
"loss": -0.0,
|
|
"num_tokens": 67664638.0,
|
|
"reward": 1.3270107507705688,
|
|
"reward_std": 0.156023770570755,
|
|
"rewards/accuracy_reward_long_step": 0.4296875,
|
|
"rewards/final_brier_reward_long_step": 0.7715405821800232,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8255646228790283,
|
|
"step": 138
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 502.0,
|
|
"completions/max_terminated_length": 502.0,
|
|
"completions/mean_length": 216.421875,
|
|
"completions/mean_terminated_length": 216.421875,
|
|
"completions/min_length": 106.0,
|
|
"completions/min_terminated_length": 106.0,
|
|
"epoch": 0.2224,
|
|
"grad_norm": 0.0344645120203495,
|
|
"learning_rate": 8.647686832740213e-07,
|
|
"loss": -0.0067,
|
|
"num_tokens": 68146034.0,
|
|
"reward": 1.2016233205795288,
|
|
"reward_std": 0.17633959650993347,
|
|
"rewards/accuracy_reward_long_step": 0.30078125,
|
|
"rewards/final_brier_reward_long_step": 0.7998050451278687,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.803563117980957,
|
|
"step": 139
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 393.0,
|
|
"completions/max_terminated_length": 393.0,
|
|
"completions/mean_length": 209.14453125,
|
|
"completions/mean_terminated_length": 209.14453125,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.224,
|
|
"grad_norm": 0.029939748346805573,
|
|
"learning_rate": 8.629893238434164e-07,
|
|
"loss": -0.0057,
|
|
"num_tokens": 68631727.0,
|
|
"reward": 1.4235725402832031,
|
|
"reward_std": 0.14265938103199005,
|
|
"rewards/accuracy_reward_long_step": 0.55078125,
|
|
"rewards/final_brier_reward_long_step": 0.720478892326355,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7706866264343262,
|
|
"step": 140
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 362.0,
|
|
"completions/max_terminated_length": 362.0,
|
|
"completions/mean_length": 209.578125,
|
|
"completions/mean_terminated_length": 209.578125,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.2256,
|
|
"grad_norm": 0.030273519456386566,
|
|
"learning_rate": 8.612099644128114e-07,
|
|
"loss": -0.0046,
|
|
"num_tokens": 69117867.0,
|
|
"reward": 1.3343067169189453,
|
|
"reward_std": 0.16028451919555664,
|
|
"rewards/accuracy_reward_long_step": 0.453125,
|
|
"rewards/final_brier_reward_long_step": 0.7540902495384216,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7784492373466492,
|
|
"step": 141
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 426.0,
|
|
"completions/max_terminated_length": 426.0,
|
|
"completions/mean_length": 220.03515625,
|
|
"completions/mean_terminated_length": 220.03515625,
|
|
"completions/min_length": 95.0,
|
|
"completions/min_terminated_length": 95.0,
|
|
"epoch": 0.2272,
|
|
"grad_norm": 0.03172041475772858,
|
|
"learning_rate": 8.594306049822063e-07,
|
|
"loss": 0.0148,
|
|
"num_tokens": 69616820.0,
|
|
"reward": 1.3687831163406372,
|
|
"reward_std": 0.12923522293567657,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.7290538549423218,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7929534316062927,
|
|
"step": 142
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 428.0,
|
|
"completions/max_terminated_length": 428.0,
|
|
"completions/mean_length": 218.36328125,
|
|
"completions/mean_terminated_length": 218.36328125,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.2288,
|
|
"grad_norm": 0.03149149566888809,
|
|
"learning_rate": 8.576512455516014e-07,
|
|
"loss": 0.0306,
|
|
"num_tokens": 70114945.0,
|
|
"reward": 1.3903582096099854,
|
|
"reward_std": 0.25830644369125366,
|
|
"rewards/accuracy_reward_long_step": 0.52734375,
|
|
"rewards/final_brier_reward_long_step": 0.6927156448364258,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7593424320220947,
|
|
"step": 143
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 450.0,
|
|
"completions/max_terminated_length": 450.0,
|
|
"completions/mean_length": 208.25390625,
|
|
"completions/mean_terminated_length": 208.25390625,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.2304,
|
|
"grad_norm": 0.03169076144695282,
|
|
"learning_rate": 8.558718861209963e-07,
|
|
"loss": 0.0027,
|
|
"num_tokens": 70590714.0,
|
|
"reward": 1.319934606552124,
|
|
"reward_std": 0.18163828551769257,
|
|
"rewards/accuracy_reward_long_step": 0.4296875,
|
|
"rewards/final_brier_reward_long_step": 0.7742601633071899,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7867279052734375,
|
|
"step": 144
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 407.0,
|
|
"completions/max_terminated_length": 407.0,
|
|
"completions/mean_length": 207.875,
|
|
"completions/mean_terminated_length": 207.875,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.232,
|
|
"grad_norm": 0.055429354310035706,
|
|
"learning_rate": 8.540925266903915e-07,
|
|
"loss": 0.0016,
|
|
"num_tokens": 71085394.0,
|
|
"reward": 1.3640680313110352,
|
|
"reward_std": 0.20486664772033691,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.7340711355209351,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7847013473510742,
|
|
"step": 145
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 375.0,
|
|
"completions/max_terminated_length": 375.0,
|
|
"completions/mean_length": 218.2109375,
|
|
"completions/mean_terminated_length": 218.2109375,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.2336,
|
|
"grad_norm": 0.03302358463406563,
|
|
"learning_rate": 8.523131672597864e-07,
|
|
"loss": 0.0093,
|
|
"num_tokens": 71574600.0,
|
|
"reward": 1.4761652946472168,
|
|
"reward_std": 0.2185746729373932,
|
|
"rewards/accuracy_reward_long_step": 0.61328125,
|
|
"rewards/final_brier_reward_long_step": 0.6924906373023987,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7668579816818237,
|
|
"step": 146
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 477.0,
|
|
"completions/max_terminated_length": 477.0,
|
|
"completions/mean_length": 209.34765625,
|
|
"completions/mean_terminated_length": 209.34765625,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.2352,
|
|
"grad_norm": 0.03392917290329933,
|
|
"learning_rate": 8.505338078291815e-07,
|
|
"loss": -0.0052,
|
|
"num_tokens": 72047361.0,
|
|
"reward": 1.4971990585327148,
|
|
"reward_std": 0.17058077454566956,
|
|
"rewards/accuracy_reward_long_step": 0.6328125,
|
|
"rewards/final_brier_reward_long_step": 0.6820136904716492,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7755322456359863,
|
|
"step": 147
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 396.0,
|
|
"completions/max_terminated_length": 396.0,
|
|
"completions/mean_length": 208.20703125,
|
|
"completions/mean_terminated_length": 208.20703125,
|
|
"completions/min_length": 88.0,
|
|
"completions/min_terminated_length": 88.0,
|
|
"epoch": 0.2368,
|
|
"grad_norm": 0.03449048101902008,
|
|
"learning_rate": 8.487544483985765e-07,
|
|
"loss": 0.0062,
|
|
"num_tokens": 72528174.0,
|
|
"reward": 1.5363779067993164,
|
|
"reward_std": 0.17959806323051453,
|
|
"rewards/accuracy_reward_long_step": 0.671875,
|
|
"rewards/final_brier_reward_long_step": 0.7153710722923279,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7426407337188721,
|
|
"step": 148
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 439.0,
|
|
"completions/max_terminated_length": 439.0,
|
|
"completions/mean_length": 208.875,
|
|
"completions/mean_terminated_length": 208.875,
|
|
"completions/min_length": 125.0,
|
|
"completions/min_terminated_length": 125.0,
|
|
"epoch": 0.2384,
|
|
"grad_norm": 0.030924499034881592,
|
|
"learning_rate": 8.469750889679715e-07,
|
|
"loss": 0.0016,
|
|
"num_tokens": 73014590.0,
|
|
"reward": 1.2735717296600342,
|
|
"reward_std": 0.16099971532821655,
|
|
"rewards/accuracy_reward_long_step": 0.38671875,
|
|
"rewards/final_brier_reward_long_step": 0.7689594030380249,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7784522771835327,
|
|
"step": 149
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 430.0,
|
|
"completions/max_terminated_length": 430.0,
|
|
"completions/mean_length": 222.2109375,
|
|
"completions/mean_terminated_length": 222.2109375,
|
|
"completions/min_length": 116.0,
|
|
"completions/min_terminated_length": 116.0,
|
|
"epoch": 0.24,
|
|
"grad_norm": 0.0321771465241909,
|
|
"learning_rate": 8.451957295373665e-07,
|
|
"loss": -0.0022,
|
|
"num_tokens": 73481692.0,
|
|
"reward": 1.3676480054855347,
|
|
"reward_std": 0.2422865331172943,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.7435883283615112,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8051284551620483,
|
|
"step": 150
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 500.0,
|
|
"completions/max_terminated_length": 500.0,
|
|
"completions/mean_length": 215.85546875,
|
|
"completions/mean_terminated_length": 215.85546875,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.2416,
|
|
"grad_norm": 0.03657348453998566,
|
|
"learning_rate": 8.434163701067614e-07,
|
|
"loss": 0.0096,
|
|
"num_tokens": 73961575.0,
|
|
"reward": 1.4443353414535522,
|
|
"reward_std": 0.21228715777397156,
|
|
"rewards/accuracy_reward_long_step": 0.55859375,
|
|
"rewards/final_brier_reward_long_step": 0.7566316723823547,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7863348126411438,
|
|
"step": 151
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 398.0,
|
|
"completions/max_terminated_length": 398.0,
|
|
"completions/mean_length": 213.53515625,
|
|
"completions/mean_terminated_length": 213.53515625,
|
|
"completions/min_length": 89.0,
|
|
"completions/min_terminated_length": 89.0,
|
|
"epoch": 0.2432,
|
|
"grad_norm": 0.03434426710009575,
|
|
"learning_rate": 8.416370106761566e-07,
|
|
"loss": 0.0015,
|
|
"num_tokens": 74427848.0,
|
|
"reward": 1.223260521888733,
|
|
"reward_std": 0.18570977449417114,
|
|
"rewards/accuracy_reward_long_step": 0.33203125,
|
|
"rewards/final_brier_reward_long_step": 0.781054675579071,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7838626503944397,
|
|
"step": 152
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 469.0,
|
|
"completions/max_terminated_length": 469.0,
|
|
"completions/mean_length": 219.49609375,
|
|
"completions/mean_terminated_length": 219.49609375,
|
|
"completions/min_length": 120.0,
|
|
"completions/min_terminated_length": 120.0,
|
|
"epoch": 0.2448,
|
|
"grad_norm": 0.029247252270579338,
|
|
"learning_rate": 8.398576512455516e-07,
|
|
"loss": 0.0116,
|
|
"num_tokens": 74910703.0,
|
|
"reward": 1.4385360479354858,
|
|
"reward_std": 0.24247096478939056,
|
|
"rewards/accuracy_reward_long_step": 0.56640625,
|
|
"rewards/final_brier_reward_long_step": 0.7055359482765198,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7829832434654236,
|
|
"step": 153
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 376.0,
|
|
"completions/max_terminated_length": 376.0,
|
|
"completions/mean_length": 220.2734375,
|
|
"completions/mean_terminated_length": 220.2734375,
|
|
"completions/min_length": 100.0,
|
|
"completions/min_terminated_length": 100.0,
|
|
"epoch": 0.2464,
|
|
"grad_norm": 0.02955593541264534,
|
|
"learning_rate": 8.380782918149466e-07,
|
|
"loss": -0.0045,
|
|
"num_tokens": 75393821.0,
|
|
"reward": 1.454419732093811,
|
|
"reward_std": 0.15751710534095764,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.7552148699760437,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7812142968177795,
|
|
"step": 154
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 474.0,
|
|
"completions/max_terminated_length": 474.0,
|
|
"completions/mean_length": 218.16796875,
|
|
"completions/mean_terminated_length": 218.16796875,
|
|
"completions/min_length": 85.0,
|
|
"completions/min_terminated_length": 85.0,
|
|
"epoch": 0.248,
|
|
"grad_norm": 0.03182348608970642,
|
|
"learning_rate": 8.362989323843416e-07,
|
|
"loss": 0.0059,
|
|
"num_tokens": 75881664.0,
|
|
"reward": 1.4293267726898193,
|
|
"reward_std": 0.20279854536056519,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.7694789171218872,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7915782928466797,
|
|
"step": 155
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 496.0,
|
|
"completions/max_terminated_length": 496.0,
|
|
"completions/mean_length": 238.328125,
|
|
"completions/mean_terminated_length": 238.328125,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.2496,
|
|
"grad_norm": 0.030359633266925812,
|
|
"learning_rate": 8.345195729537366e-07,
|
|
"loss": 0.0028,
|
|
"num_tokens": 76368372.0,
|
|
"reward": 1.3188002109527588,
|
|
"reward_std": 0.2175511121749878,
|
|
"rewards/accuracy_reward_long_step": 0.43359375,
|
|
"rewards/final_brier_reward_long_step": 0.7726074457168579,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7682181596755981,
|
|
"step": 156
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 508.0,
|
|
"completions/max_terminated_length": 508.0,
|
|
"completions/mean_length": 225.96484375,
|
|
"completions/mean_terminated_length": 225.96484375,
|
|
"completions/min_length": 88.0,
|
|
"completions/min_terminated_length": 88.0,
|
|
"epoch": 0.2512,
|
|
"grad_norm": 0.03001994453370571,
|
|
"learning_rate": 8.327402135231316e-07,
|
|
"loss": 0.0162,
|
|
"num_tokens": 76846323.0,
|
|
"reward": 1.509331226348877,
|
|
"reward_std": 0.18848416209220886,
|
|
"rewards/accuracy_reward_long_step": 0.6171875,
|
|
"rewards/final_brier_reward_long_step": 0.7627733945846558,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.805801272392273,
|
|
"step": 157
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 431.0,
|
|
"completions/max_terminated_length": 431.0,
|
|
"completions/mean_length": 245.515625,
|
|
"completions/mean_terminated_length": 245.515625,
|
|
"completions/min_length": 116.0,
|
|
"completions/min_terminated_length": 116.0,
|
|
"epoch": 0.2528,
|
|
"grad_norm": 0.02940976247191429,
|
|
"learning_rate": 8.309608540925266e-07,
|
|
"loss": 0.0073,
|
|
"num_tokens": 77351735.0,
|
|
"reward": 1.2970399856567383,
|
|
"reward_std": 0.19197387993335724,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7378246188163757,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8018975853919983,
|
|
"step": 158
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 602.0,
|
|
"completions/max_terminated_length": 602.0,
|
|
"completions/mean_length": 238.7890625,
|
|
"completions/mean_terminated_length": 238.7890625,
|
|
"completions/min_length": 134.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.2544,
|
|
"grad_norm": 0.030583331361413002,
|
|
"learning_rate": 8.291814946619217e-07,
|
|
"loss": 0.0032,
|
|
"num_tokens": 77840273.0,
|
|
"reward": 1.191973090171814,
|
|
"reward_std": 0.15967227518558502,
|
|
"rewards/accuracy_reward_long_step": 0.29296875,
|
|
"rewards/final_brier_reward_long_step": 0.7833398580551147,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8126777410507202,
|
|
"step": 159
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 447.0,
|
|
"completions/max_terminated_length": 447.0,
|
|
"completions/mean_length": 222.33984375,
|
|
"completions/mean_terminated_length": 222.33984375,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.256,
|
|
"grad_norm": 0.02875317633152008,
|
|
"learning_rate": 8.274021352313167e-07,
|
|
"loss": 0.001,
|
|
"num_tokens": 78315032.0,
|
|
"reward": 1.4728080034255981,
|
|
"reward_std": 0.1814574897289276,
|
|
"rewards/accuracy_reward_long_step": 0.578125,
|
|
"rewards/final_brier_reward_long_step": 0.7983136773109436,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7804182767868042,
|
|
"step": 160
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 396.0,
|
|
"completions/max_terminated_length": 396.0,
|
|
"completions/mean_length": 243.73828125,
|
|
"completions/mean_terminated_length": 243.73828125,
|
|
"completions/min_length": 133.0,
|
|
"completions/min_terminated_length": 133.0,
|
|
"epoch": 0.2576,
|
|
"grad_norm": 0.028153013437986374,
|
|
"learning_rate": 8.256227758007117e-07,
|
|
"loss": -0.0008,
|
|
"num_tokens": 78800285.0,
|
|
"reward": 1.3710644245147705,
|
|
"reward_std": 0.21033860743045807,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.7794238328933716,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7829592227935791,
|
|
"step": 161
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 539.0,
|
|
"completions/max_terminated_length": 539.0,
|
|
"completions/mean_length": 247.23828125,
|
|
"completions/mean_terminated_length": 247.23828125,
|
|
"completions/min_length": 139.0,
|
|
"completions/min_terminated_length": 139.0,
|
|
"epoch": 0.2592,
|
|
"grad_norm": 0.02903878502547741,
|
|
"learning_rate": 8.238434163701067e-07,
|
|
"loss": 0.0002,
|
|
"num_tokens": 79278586.0,
|
|
"reward": 1.3124269247055054,
|
|
"reward_std": 0.22732782363891602,
|
|
"rewards/accuracy_reward_long_step": 0.42578125,
|
|
"rewards/final_brier_reward_long_step": 0.7707054615020752,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7758771181106567,
|
|
"step": 162
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 574.0,
|
|
"completions/max_terminated_length": 574.0,
|
|
"completions/mean_length": 249.76171875,
|
|
"completions/mean_terminated_length": 249.76171875,
|
|
"completions/min_length": 146.0,
|
|
"completions/min_terminated_length": 146.0,
|
|
"epoch": 0.2608,
|
|
"grad_norm": 0.028138713911175728,
|
|
"learning_rate": 8.220640569395017e-07,
|
|
"loss": -0.0073,
|
|
"num_tokens": 79765109.0,
|
|
"reward": 1.4664216041564941,
|
|
"reward_std": 0.16438095271587372,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.7759047150611877,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.808531641960144,
|
|
"step": 163
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 516.0,
|
|
"completions/max_terminated_length": 516.0,
|
|
"completions/mean_length": 241.08984375,
|
|
"completions/mean_terminated_length": 241.08984375,
|
|
"completions/min_length": 147.0,
|
|
"completions/min_terminated_length": 147.0,
|
|
"epoch": 0.2624,
|
|
"grad_norm": 0.028392404317855835,
|
|
"learning_rate": 8.202846975088967e-07,
|
|
"loss": 0.0032,
|
|
"num_tokens": 80258676.0,
|
|
"reward": 1.501215934753418,
|
|
"reward_std": 0.17474979162216187,
|
|
"rewards/accuracy_reward_long_step": 0.6171875,
|
|
"rewards/final_brier_reward_long_step": 0.7508887052536011,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7852252721786499,
|
|
"step": 164
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 460.0,
|
|
"completions/max_terminated_length": 460.0,
|
|
"completions/mean_length": 231.37109375,
|
|
"completions/mean_terminated_length": 231.37109375,
|
|
"completions/min_length": 134.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.264,
|
|
"grad_norm": 0.028890179470181465,
|
|
"learning_rate": 8.185053380782919e-07,
|
|
"loss": 0.0041,
|
|
"num_tokens": 80732043.0,
|
|
"reward": 1.4701359272003174,
|
|
"reward_std": 0.1824515014886856,
|
|
"rewards/accuracy_reward_long_step": 0.58203125,
|
|
"rewards/final_brier_reward_long_step": 0.7417089939117432,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8107101321220398,
|
|
"step": 165
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 447.0,
|
|
"completions/max_terminated_length": 447.0,
|
|
"completions/mean_length": 238.31640625,
|
|
"completions/mean_terminated_length": 238.31640625,
|
|
"completions/min_length": 147.0,
|
|
"completions/min_terminated_length": 147.0,
|
|
"epoch": 0.2656,
|
|
"grad_norm": 0.04968814551830292,
|
|
"learning_rate": 8.167259786476868e-07,
|
|
"loss": 0.0087,
|
|
"num_tokens": 81219316.0,
|
|
"reward": 1.3115195035934448,
|
|
"reward_std": 0.1865576058626175,
|
|
"rewards/accuracy_reward_long_step": 0.421875,
|
|
"rewards/final_brier_reward_long_step": 0.7827734351158142,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7758046984672546,
|
|
"step": 166
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 473.0,
|
|
"completions/max_terminated_length": 473.0,
|
|
"completions/mean_length": 246.90625,
|
|
"completions/mean_terminated_length": 246.90625,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.2672,
|
|
"grad_norm": 0.02806948870420456,
|
|
"learning_rate": 8.149466192170819e-07,
|
|
"loss": -0.0126,
|
|
"num_tokens": 81722972.0,
|
|
"reward": 1.287018060684204,
|
|
"reward_std": 0.17807143926620483,
|
|
"rewards/accuracy_reward_long_step": 0.41015625,
|
|
"rewards/final_brier_reward_long_step": 0.7411332130432129,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.766313910484314,
|
|
"step": 167
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 430.0,
|
|
"completions/max_terminated_length": 430.0,
|
|
"completions/mean_length": 242.71875,
|
|
"completions/mean_terminated_length": 242.71875,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.2688,
|
|
"grad_norm": 0.02786392532289028,
|
|
"learning_rate": 8.131672597864768e-07,
|
|
"loss": -0.0143,
|
|
"num_tokens": 82208676.0,
|
|
"reward": 1.3826302289962769,
|
|
"reward_std": 0.16845399141311646,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.7937402129173279,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8305305242538452,
|
|
"step": 168
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 408.0,
|
|
"completions/max_terminated_length": 408.0,
|
|
"completions/mean_length": 248.65234375,
|
|
"completions/mean_terminated_length": 248.65234375,
|
|
"completions/min_length": 149.0,
|
|
"completions/min_terminated_length": 149.0,
|
|
"epoch": 0.2704,
|
|
"grad_norm": 0.02810235507786274,
|
|
"learning_rate": 8.113879003558719e-07,
|
|
"loss": -0.0028,
|
|
"num_tokens": 82703459.0,
|
|
"reward": 1.2465064525604248,
|
|
"reward_std": 0.1827697902917862,
|
|
"rewards/accuracy_reward_long_step": 0.375,
|
|
"rewards/final_brier_reward_long_step": 0.701416015625,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7846096754074097,
|
|
"step": 169
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 616.0,
|
|
"completions/max_terminated_length": 616.0,
|
|
"completions/mean_length": 256.98046875,
|
|
"completions/mean_terminated_length": 256.98046875,
|
|
"completions/min_length": 154.0,
|
|
"completions/min_terminated_length": 154.0,
|
|
"epoch": 0.272,
|
|
"grad_norm": 0.03040560707449913,
|
|
"learning_rate": 8.096085409252668e-07,
|
|
"loss": -0.0086,
|
|
"num_tokens": 83174350.0,
|
|
"reward": 1.3326289653778076,
|
|
"reward_std": 0.19962584972381592,
|
|
"rewards/accuracy_reward_long_step": 0.44921875,
|
|
"rewards/final_brier_reward_long_step": 0.7591210603713989,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7823324203491211,
|
|
"step": 170
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 595.0,
|
|
"completions/max_terminated_length": 595.0,
|
|
"completions/mean_length": 250.66015625,
|
|
"completions/mean_terminated_length": 250.66015625,
|
|
"completions/min_length": 149.0,
|
|
"completions/min_terminated_length": 149.0,
|
|
"epoch": 0.2736,
|
|
"grad_norm": 0.02771839126944542,
|
|
"learning_rate": 8.078291814946618e-07,
|
|
"loss": -0.0059,
|
|
"num_tokens": 83664551.0,
|
|
"reward": 1.3717570304870605,
|
|
"reward_std": 0.20381051301956177,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.7613476514816284,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8038052320480347,
|
|
"step": 171
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 438.0,
|
|
"completions/max_terminated_length": 438.0,
|
|
"completions/mean_length": 245.0703125,
|
|
"completions/mean_terminated_length": 245.0703125,
|
|
"completions/min_length": 142.0,
|
|
"completions/min_terminated_length": 142.0,
|
|
"epoch": 0.2752,
|
|
"grad_norm": 0.03148366138339043,
|
|
"learning_rate": 8.06049822064057e-07,
|
|
"loss": 0.0026,
|
|
"num_tokens": 84163441.0,
|
|
"reward": 1.2595324516296387,
|
|
"reward_std": 0.21818827092647552,
|
|
"rewards/accuracy_reward_long_step": 0.37890625,
|
|
"rewards/final_brier_reward_long_step": 0.7574383020401001,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7650665044784546,
|
|
"step": 172
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 392.0,
|
|
"completions/max_terminated_length": 392.0,
|
|
"completions/mean_length": 241.73828125,
|
|
"completions/mean_terminated_length": 241.73828125,
|
|
"completions/min_length": 143.0,
|
|
"completions/min_terminated_length": 143.0,
|
|
"epoch": 0.2768,
|
|
"grad_norm": 0.027540108188986778,
|
|
"learning_rate": 8.042704626334519e-07,
|
|
"loss": 0.0026,
|
|
"num_tokens": 84651926.0,
|
|
"reward": 1.3201969861984253,
|
|
"reward_std": 0.23394638299942017,
|
|
"rewards/accuracy_reward_long_step": 0.4296875,
|
|
"rewards/final_brier_reward_long_step": 0.7422363758087158,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8198016285896301,
|
|
"step": 173
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 639.0,
|
|
"completions/max_terminated_length": 639.0,
|
|
"completions/mean_length": 245.08984375,
|
|
"completions/mean_terminated_length": 245.08984375,
|
|
"completions/min_length": 59.0,
|
|
"completions/min_terminated_length": 59.0,
|
|
"epoch": 0.2784,
|
|
"grad_norm": 0.02940957434475422,
|
|
"learning_rate": 8.02491103202847e-07,
|
|
"loss": -0.0074,
|
|
"num_tokens": 85147357.0,
|
|
"reward": 1.2906839847564697,
|
|
"reward_std": 0.2153157889842987,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7498632669448853,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7644352912902832,
|
|
"step": 174
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 442.0,
|
|
"completions/max_terminated_length": 442.0,
|
|
"completions/mean_length": 245.8515625,
|
|
"completions/mean_terminated_length": 245.8515625,
|
|
"completions/min_length": 126.0,
|
|
"completions/min_terminated_length": 126.0,
|
|
"epoch": 0.28,
|
|
"grad_norm": 0.03165048733353615,
|
|
"learning_rate": 8.007117437722419e-07,
|
|
"loss": 0.0137,
|
|
"num_tokens": 85624735.0,
|
|
"reward": 1.4603183269500732,
|
|
"reward_std": 0.22203630208969116,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.7952343821525574,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.796039342880249,
|
|
"step": 175
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 407.0,
|
|
"completions/max_terminated_length": 407.0,
|
|
"completions/mean_length": 250.11328125,
|
|
"completions/mean_terminated_length": 251.09413146972656,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 150.0,
|
|
"epoch": 0.2816,
|
|
"grad_norm": 0.0285260621458292,
|
|
"learning_rate": 7.98932384341637e-07,
|
|
"loss": -0.0097,
|
|
"num_tokens": 86101692.0,
|
|
"reward": 1.4544804096221924,
|
|
"reward_std": 0.21490904688835144,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.7437987923622131,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8084980249404907,
|
|
"step": 176
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 387.0,
|
|
"completions/max_terminated_length": 387.0,
|
|
"completions/mean_length": 247.91015625,
|
|
"completions/mean_terminated_length": 247.91015625,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.2832,
|
|
"grad_norm": 0.02845529466867447,
|
|
"learning_rate": 7.97153024911032e-07,
|
|
"loss": 0.0112,
|
|
"num_tokens": 86593685.0,
|
|
"reward": 1.3518202304840088,
|
|
"reward_std": 0.15090304613113403,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.7558691501617432,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7764118909835815,
|
|
"step": 177
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 719.0,
|
|
"completions/max_terminated_length": 719.0,
|
|
"completions/mean_length": 253.98046875,
|
|
"completions/mean_terminated_length": 253.98046875,
|
|
"completions/min_length": 146.0,
|
|
"completions/min_terminated_length": 146.0,
|
|
"epoch": 0.2848,
|
|
"grad_norm": 0.029682578518986702,
|
|
"learning_rate": 7.95373665480427e-07,
|
|
"loss": -0.0156,
|
|
"num_tokens": 87076488.0,
|
|
"reward": 1.3396403789520264,
|
|
"reward_std": 0.1541176736354828,
|
|
"rewards/accuracy_reward_long_step": 0.4375,
|
|
"rewards/final_brier_reward_long_step": 0.8080171346664429,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8005446195602417,
|
|
"step": 178
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 439.0,
|
|
"completions/max_terminated_length": 439.0,
|
|
"completions/mean_length": 248.99609375,
|
|
"completions/mean_terminated_length": 248.99609375,
|
|
"completions/min_length": 144.0,
|
|
"completions/min_terminated_length": 144.0,
|
|
"epoch": 0.2864,
|
|
"grad_norm": 0.028529809787869453,
|
|
"learning_rate": 7.935943060498221e-07,
|
|
"loss": 0.0108,
|
|
"num_tokens": 87564831.0,
|
|
"reward": 1.3985368013381958,
|
|
"reward_std": 0.15740060806274414,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.7856543064117432,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8241176605224609,
|
|
"step": 179
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 439.0,
|
|
"completions/max_terminated_length": 439.0,
|
|
"completions/mean_length": 242.43359375,
|
|
"completions/mean_terminated_length": 242.43359375,
|
|
"completions/min_length": 137.0,
|
|
"completions/min_terminated_length": 137.0,
|
|
"epoch": 0.288,
|
|
"grad_norm": 0.029515286907553673,
|
|
"learning_rate": 7.91814946619217e-07,
|
|
"loss": -0.0121,
|
|
"num_tokens": 88038206.0,
|
|
"reward": 1.4421117305755615,
|
|
"reward_std": 0.245658278465271,
|
|
"rewards/accuracy_reward_long_step": 0.56640625,
|
|
"rewards/final_brier_reward_long_step": 0.7302929759025574,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7725290060043335,
|
|
"step": 180
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 479.0,
|
|
"completions/max_terminated_length": 479.0,
|
|
"completions/mean_length": 247.80859375,
|
|
"completions/mean_terminated_length": 247.80859375,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"epoch": 0.2896,
|
|
"grad_norm": 0.028533408418297768,
|
|
"learning_rate": 7.900355871886121e-07,
|
|
"loss": -0.0021,
|
|
"num_tokens": 88526117.0,
|
|
"reward": 1.4392614364624023,
|
|
"reward_std": 0.1886727213859558,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.7036230564117432,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7721726894378662,
|
|
"step": 181
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 402.0,
|
|
"completions/max_terminated_length": 402.0,
|
|
"completions/mean_length": 247.95703125,
|
|
"completions/mean_terminated_length": 247.95703125,
|
|
"completions/min_length": 142.0,
|
|
"completions/min_terminated_length": 142.0,
|
|
"epoch": 0.2912,
|
|
"grad_norm": 0.03011815994977951,
|
|
"learning_rate": 7.88256227758007e-07,
|
|
"loss": 0.0105,
|
|
"num_tokens": 89007906.0,
|
|
"reward": 1.3243916034698486,
|
|
"reward_std": 0.23409831523895264,
|
|
"rewards/accuracy_reward_long_step": 0.44921875,
|
|
"rewards/final_brier_reward_long_step": 0.7340039014816284,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7666873931884766,
|
|
"step": 182
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 520.0,
|
|
"completions/max_terminated_length": 520.0,
|
|
"completions/mean_length": 257.90234375,
|
|
"completions/mean_terminated_length": 257.90234375,
|
|
"completions/min_length": 138.0,
|
|
"completions/min_terminated_length": 138.0,
|
|
"epoch": 0.2928,
|
|
"grad_norm": 0.02743351273238659,
|
|
"learning_rate": 7.864768683274021e-07,
|
|
"loss": 0.0068,
|
|
"num_tokens": 89487889.0,
|
|
"reward": 1.3753552436828613,
|
|
"reward_std": 0.13086272776126862,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.7642577886581421,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7996633648872375,
|
|
"step": 183
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 714.0,
|
|
"completions/max_terminated_length": 714.0,
|
|
"completions/mean_length": 260.5,
|
|
"completions/mean_terminated_length": 260.5,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.2944,
|
|
"grad_norm": 0.02766694501042366,
|
|
"learning_rate": 7.846975088967971e-07,
|
|
"loss": 0.0127,
|
|
"num_tokens": 89978625.0,
|
|
"reward": 1.4390387535095215,
|
|
"reward_std": 0.17982302606105804,
|
|
"rewards/accuracy_reward_long_step": 0.55859375,
|
|
"rewards/final_brier_reward_long_step": 0.7528809309005737,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7688993811607361,
|
|
"step": 184
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 419.0,
|
|
"completions/max_terminated_length": 419.0,
|
|
"completions/mean_length": 271.25390625,
|
|
"completions/mean_terminated_length": 271.25390625,
|
|
"completions/min_length": 160.0,
|
|
"completions/min_terminated_length": 160.0,
|
|
"epoch": 0.296,
|
|
"grad_norm": 0.0269797183573246,
|
|
"learning_rate": 7.829181494661921e-07,
|
|
"loss": -0.0066,
|
|
"num_tokens": 90473290.0,
|
|
"reward": 1.3175511360168457,
|
|
"reward_std": 0.13507431745529175,
|
|
"rewards/accuracy_reward_long_step": 0.453125,
|
|
"rewards/final_brier_reward_long_step": 0.7091602087020874,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7485443353652954,
|
|
"step": 185
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 471.0,
|
|
"completions/max_terminated_length": 471.0,
|
|
"completions/mean_length": 268.40625,
|
|
"completions/mean_terminated_length": 268.40625,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.2976,
|
|
"grad_norm": 0.027505241334438324,
|
|
"learning_rate": 7.811387900355872e-07,
|
|
"loss": -0.0114,
|
|
"num_tokens": 90958874.0,
|
|
"reward": 1.517210602760315,
|
|
"reward_std": 0.18232710659503937,
|
|
"rewards/accuracy_reward_long_step": 0.6171875,
|
|
"rewards/final_brier_reward_long_step": 0.8131054639816284,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7869866490364075,
|
|
"step": 186
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 481.0,
|
|
"completions/max_terminated_length": 481.0,
|
|
"completions/mean_length": 275.80859375,
|
|
"completions/mean_terminated_length": 275.80859375,
|
|
"completions/min_length": 149.0,
|
|
"completions/min_terminated_length": 149.0,
|
|
"epoch": 0.2992,
|
|
"grad_norm": 0.0265911016613245,
|
|
"learning_rate": 7.793594306049822e-07,
|
|
"loss": -0.0118,
|
|
"num_tokens": 91450809.0,
|
|
"reward": 1.4153180122375488,
|
|
"reward_std": 0.20222672820091248,
|
|
"rewards/accuracy_reward_long_step": 0.52734375,
|
|
"rewards/final_brier_reward_long_step": 0.724365234375,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8275318145751953,
|
|
"step": 187
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 516.0,
|
|
"completions/max_terminated_length": 516.0,
|
|
"completions/mean_length": 282.4375,
|
|
"completions/mean_terminated_length": 282.4375,
|
|
"completions/min_length": 130.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 0.3008,
|
|
"grad_norm": 0.02835831232368946,
|
|
"learning_rate": 7.775800711743772e-07,
|
|
"loss": 0.0055,
|
|
"num_tokens": 91962913.0,
|
|
"reward": 1.3732486963272095,
|
|
"reward_std": 0.21462617814540863,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.7563574314117432,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.799137532711029,
|
|
"step": 188
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 505.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 282.69140625,
|
|
"completions/mean_terminated_length": 282.69140625,
|
|
"completions/min_length": 158.0,
|
|
"completions/min_terminated_length": 158.0,
|
|
"epoch": 0.3024,
|
|
"grad_norm": 0.026161570101976395,
|
|
"learning_rate": 7.758007117437722e-07,
|
|
"loss": 0.0141,
|
|
"num_tokens": 92451986.0,
|
|
"reward": 1.4073671102523804,
|
|
"reward_std": 0.12184424698352814,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.825976550579071,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8034918308258057,
|
|
"step": 189
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 459.0,
|
|
"completions/max_terminated_length": 459.0,
|
|
"completions/mean_length": 286.796875,
|
|
"completions/mean_terminated_length": 286.796875,
|
|
"completions/min_length": 165.0,
|
|
"completions/min_terminated_length": 165.0,
|
|
"epoch": 0.304,
|
|
"grad_norm": 0.027012880891561508,
|
|
"learning_rate": 7.740213523131672e-07,
|
|
"loss": 0.0021,
|
|
"num_tokens": 92948694.0,
|
|
"reward": 1.2427858114242554,
|
|
"reward_std": 0.235196053981781,
|
|
"rewards/accuracy_reward_long_step": 0.37109375,
|
|
"rewards/final_brier_reward_long_step": 0.6898242235183716,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7969439625740051,
|
|
"step": 190
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 517.0,
|
|
"completions/max_terminated_length": 517.0,
|
|
"completions/mean_length": 282.46484375,
|
|
"completions/mean_terminated_length": 282.46484375,
|
|
"completions/min_length": 165.0,
|
|
"completions/min_terminated_length": 165.0,
|
|
"epoch": 0.3056,
|
|
"grad_norm": 0.037487372756004333,
|
|
"learning_rate": 7.722419928825622e-07,
|
|
"loss": 0.0056,
|
|
"num_tokens": 93449965.0,
|
|
"reward": 1.3565409183502197,
|
|
"reward_std": 0.14568164944648743,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.733447253704071,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8177168369293213,
|
|
"step": 191
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 547.0,
|
|
"completions/max_terminated_length": 547.0,
|
|
"completions/mean_length": 294.59375,
|
|
"completions/mean_terminated_length": 294.59375,
|
|
"completions/min_length": 164.0,
|
|
"completions/min_terminated_length": 164.0,
|
|
"epoch": 0.3072,
|
|
"grad_norm": 0.026451628655195236,
|
|
"learning_rate": 7.704626334519572e-07,
|
|
"loss": 0.0008,
|
|
"num_tokens": 93958261.0,
|
|
"reward": 1.1881611347198486,
|
|
"reward_std": 0.18037152290344238,
|
|
"rewards/accuracy_reward_long_step": 0.296875,
|
|
"rewards/final_brier_reward_long_step": 0.7727734446525574,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.792371392250061,
|
|
"step": 192
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 505.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 290.57421875,
|
|
"completions/mean_terminated_length": 290.57421875,
|
|
"completions/min_length": 150.0,
|
|
"completions/min_terminated_length": 150.0,
|
|
"epoch": 0.3088,
|
|
"grad_norm": 0.026738133281469345,
|
|
"learning_rate": 7.686832740213523e-07,
|
|
"loss": 0.0111,
|
|
"num_tokens": 94465464.0,
|
|
"reward": 1.466090202331543,
|
|
"reward_std": 0.17279371619224548,
|
|
"rewards/accuracy_reward_long_step": 0.56640625,
|
|
"rewards/final_brier_reward_long_step": 0.7865039110183716,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8122318983078003,
|
|
"step": 193
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 511.0,
|
|
"completions/max_terminated_length": 511.0,
|
|
"completions/mean_length": 282.94921875,
|
|
"completions/mean_terminated_length": 282.94921875,
|
|
"completions/min_length": 157.0,
|
|
"completions/min_terminated_length": 157.0,
|
|
"epoch": 0.3104,
|
|
"grad_norm": 0.02935073897242546,
|
|
"learning_rate": 7.669039145907473e-07,
|
|
"loss": 0.0057,
|
|
"num_tokens": 94968371.0,
|
|
"reward": 1.3700523376464844,
|
|
"reward_std": 0.21206629276275635,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.7314281463623047,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8269064426422119,
|
|
"step": 194
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 532.0,
|
|
"completions/max_terminated_length": 532.0,
|
|
"completions/mean_length": 276.2578125,
|
|
"completions/mean_terminated_length": 276.2578125,
|
|
"completions/min_length": 161.0,
|
|
"completions/min_terminated_length": 161.0,
|
|
"epoch": 0.312,
|
|
"grad_norm": 0.028218043968081474,
|
|
"learning_rate": 7.651245551601423e-07,
|
|
"loss": -0.0055,
|
|
"num_tokens": 95464117.0,
|
|
"reward": 1.5009515285491943,
|
|
"reward_std": 0.16311804950237274,
|
|
"rewards/accuracy_reward_long_step": 0.60546875,
|
|
"rewards/final_brier_reward_long_step": 0.7480566501617432,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8338742256164551,
|
|
"step": 195
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 539.0,
|
|
"completions/max_terminated_length": 539.0,
|
|
"completions/mean_length": 277.2578125,
|
|
"completions/mean_terminated_length": 277.2578125,
|
|
"completions/min_length": 146.0,
|
|
"completions/min_terminated_length": 146.0,
|
|
"epoch": 0.3136,
|
|
"grad_norm": 0.029587451368570328,
|
|
"learning_rate": 7.633451957295374e-07,
|
|
"loss": 0.004,
|
|
"num_tokens": 95963519.0,
|
|
"reward": 1.6074358224868774,
|
|
"reward_std": 0.1871250867843628,
|
|
"rewards/accuracy_reward_long_step": 0.6953125,
|
|
"rewards/final_brier_reward_long_step": 0.8149710893630981,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8335224390029907,
|
|
"step": 196
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 495.0,
|
|
"completions/max_terminated_length": 495.0,
|
|
"completions/mean_length": 273.578125,
|
|
"completions/mean_terminated_length": 273.578125,
|
|
"completions/min_length": 164.0,
|
|
"completions/min_terminated_length": 164.0,
|
|
"epoch": 0.3152,
|
|
"grad_norm": 0.029551653191447258,
|
|
"learning_rate": 7.615658362989323e-07,
|
|
"loss": 0.008,
|
|
"num_tokens": 96461403.0,
|
|
"reward": 1.3781944513320923,
|
|
"reward_std": 0.17420879006385803,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.8260058164596558,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8117718696594238,
|
|
"step": 197
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 516.0,
|
|
"completions/max_terminated_length": 516.0,
|
|
"completions/mean_length": 274.83984375,
|
|
"completions/mean_terminated_length": 274.83984375,
|
|
"completions/min_length": 164.0,
|
|
"completions/min_terminated_length": 164.0,
|
|
"epoch": 0.3168,
|
|
"grad_norm": 0.02734399028122425,
|
|
"learning_rate": 7.597864768683274e-07,
|
|
"loss": 0.0036,
|
|
"num_tokens": 96960098.0,
|
|
"reward": 1.3913518190383911,
|
|
"reward_std": 0.20948463678359985,
|
|
"rewards/accuracy_reward_long_step": 0.50390625,
|
|
"rewards/final_brier_reward_long_step": 0.7432616949081421,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8065208792686462,
|
|
"step": 198
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 586.0,
|
|
"completions/max_terminated_length": 586.0,
|
|
"completions/mean_length": 285.73046875,
|
|
"completions/mean_terminated_length": 285.73046875,
|
|
"completions/min_length": 134.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.3184,
|
|
"grad_norm": 0.025479217991232872,
|
|
"learning_rate": 7.580071174377223e-07,
|
|
"loss": -0.0064,
|
|
"num_tokens": 97462677.0,
|
|
"reward": 1.3470879793167114,
|
|
"reward_std": 0.17569580674171448,
|
|
"rewards/accuracy_reward_long_step": 0.46484375,
|
|
"rewards/final_brier_reward_long_step": 0.7766375541687012,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7523394823074341,
|
|
"step": 199
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 487.0,
|
|
"completions/max_terminated_length": 487.0,
|
|
"completions/mean_length": 276.9609375,
|
|
"completions/mean_terminated_length": 276.9609375,
|
|
"completions/min_length": 165.0,
|
|
"completions/min_terminated_length": 165.0,
|
|
"epoch": 0.32,
|
|
"grad_norm": 0.026502054184675217,
|
|
"learning_rate": 7.562277580071174e-07,
|
|
"loss": 0.003,
|
|
"num_tokens": 97933083.0,
|
|
"reward": 1.3517411947250366,
|
|
"reward_std": 0.1127757877111435,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.7498577833175659,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7821072340011597,
|
|
"step": 200
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 440.0,
|
|
"completions/max_terminated_length": 440.0,
|
|
"completions/mean_length": 277.74609375,
|
|
"completions/mean_terminated_length": 277.74609375,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 0.3216,
|
|
"grad_norm": 0.026896316558122635,
|
|
"learning_rate": 7.544483985765125e-07,
|
|
"loss": 0.0045,
|
|
"num_tokens": 98407330.0,
|
|
"reward": 1.240262508392334,
|
|
"reward_std": 0.16174383461475372,
|
|
"rewards/accuracy_reward_long_step": 0.35546875,
|
|
"rewards/final_brier_reward_long_step": 0.798291027545929,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7408840656280518,
|
|
"step": 201
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 553.0,
|
|
"completions/max_terminated_length": 553.0,
|
|
"completions/mean_length": 261.98828125,
|
|
"completions/mean_terminated_length": 261.98828125,
|
|
"completions/min_length": 133.0,
|
|
"completions/min_terminated_length": 133.0,
|
|
"epoch": 0.3232,
|
|
"grad_norm": 0.027860237285494804,
|
|
"learning_rate": 7.526690391459074e-07,
|
|
"loss": 0.0018,
|
|
"num_tokens": 98894503.0,
|
|
"reward": 1.223290205001831,
|
|
"reward_std": 0.15445607900619507,
|
|
"rewards/accuracy_reward_long_step": 0.3515625,
|
|
"rewards/final_brier_reward_long_step": 0.7443945407867432,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7425163984298706,
|
|
"step": 202
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 481.0,
|
|
"completions/max_terminated_length": 481.0,
|
|
"completions/mean_length": 274.48828125,
|
|
"completions/mean_terminated_length": 274.48828125,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.3248,
|
|
"grad_norm": 0.02505328133702278,
|
|
"learning_rate": 7.508896797153025e-07,
|
|
"loss": -0.0032,
|
|
"num_tokens": 99377636.0,
|
|
"reward": 1.4042761325836182,
|
|
"reward_std": 0.2108316421508789,
|
|
"rewards/accuracy_reward_long_step": 0.5078125,
|
|
"rewards/final_brier_reward_long_step": 0.7737988233566284,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8120555877685547,
|
|
"step": 203
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 528.0,
|
|
"completions/max_terminated_length": 528.0,
|
|
"completions/mean_length": 271.3203125,
|
|
"completions/mean_terminated_length": 271.3203125,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.3264,
|
|
"grad_norm": 0.027994032949209213,
|
|
"learning_rate": 7.491103202846974e-07,
|
|
"loss": 0.0018,
|
|
"num_tokens": 99877070.0,
|
|
"reward": 1.4199368953704834,
|
|
"reward_std": 0.1661403775215149,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.7513816356658936,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8033663034439087,
|
|
"step": 204
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 485.0,
|
|
"completions/max_terminated_length": 485.0,
|
|
"completions/mean_length": 265.5625,
|
|
"completions/mean_terminated_length": 266.60394287109375,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 149.0,
|
|
"epoch": 0.328,
|
|
"grad_norm": 0.02644249238073826,
|
|
"learning_rate": 7.473309608540925e-07,
|
|
"loss": -0.0023,
|
|
"num_tokens": 100362006.0,
|
|
"reward": 1.4836362600326538,
|
|
"reward_std": 0.2127230167388916,
|
|
"rewards/accuracy_reward_long_step": 0.5859375,
|
|
"rewards/final_brier_reward_long_step": 0.7705457210540771,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.828061580657959,
|
|
"step": 205
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 429.0,
|
|
"completions/max_terminated_length": 429.0,
|
|
"completions/mean_length": 260.26953125,
|
|
"completions/mean_terminated_length": 260.26953125,
|
|
"completions/min_length": 129.0,
|
|
"completions/min_terminated_length": 129.0,
|
|
"epoch": 0.3296,
|
|
"grad_norm": 0.02795090340077877,
|
|
"learning_rate": 7.455516014234874e-07,
|
|
"loss": 0.0014,
|
|
"num_tokens": 100836259.0,
|
|
"reward": 1.4013100862503052,
|
|
"reward_std": 0.17857375741004944,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.7368500232696533,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7433903217315674,
|
|
"step": 206
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 463.0,
|
|
"completions/max_terminated_length": 463.0,
|
|
"completions/mean_length": 257.23828125,
|
|
"completions/mean_terminated_length": 257.23828125,
|
|
"completions/min_length": 138.0,
|
|
"completions/min_terminated_length": 138.0,
|
|
"epoch": 0.3312,
|
|
"grad_norm": 0.028254900127649307,
|
|
"learning_rate": 7.437722419928826e-07,
|
|
"loss": 0.0146,
|
|
"num_tokens": 101316480.0,
|
|
"reward": 1.4756114482879639,
|
|
"reward_std": 0.21930184960365295,
|
|
"rewards/accuracy_reward_long_step": 0.60546875,
|
|
"rewards/final_brier_reward_long_step": 0.7034816741943359,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7770892381668091,
|
|
"step": 207
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 799.0,
|
|
"completions/max_terminated_length": 799.0,
|
|
"completions/mean_length": 275.65625,
|
|
"completions/mean_terminated_length": 275.65625,
|
|
"completions/min_length": 181.0,
|
|
"completions/min_terminated_length": 181.0,
|
|
"epoch": 0.3328,
|
|
"grad_norm": 0.02601105347275734,
|
|
"learning_rate": 7.419928825622776e-07,
|
|
"loss": -0.0054,
|
|
"num_tokens": 101820640.0,
|
|
"reward": 1.4046986103057861,
|
|
"reward_std": 0.1485091894865036,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.7158496379852295,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7779449224472046,
|
|
"step": 208
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 547.0,
|
|
"completions/max_terminated_length": 547.0,
|
|
"completions/mean_length": 275.76953125,
|
|
"completions/mean_terminated_length": 275.76953125,
|
|
"completions/min_length": 168.0,
|
|
"completions/min_terminated_length": 168.0,
|
|
"epoch": 0.3344,
|
|
"grad_norm": 0.027420159429311752,
|
|
"learning_rate": 7.402135231316725e-07,
|
|
"loss": 0.0148,
|
|
"num_tokens": 102301669.0,
|
|
"reward": 1.3544528484344482,
|
|
"reward_std": 0.12338382005691528,
|
|
"rewards/accuracy_reward_long_step": 0.45703125,
|
|
"rewards/final_brier_reward_long_step": 0.750058650970459,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8396276235580444,
|
|
"step": 209
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 421.0,
|
|
"completions/max_terminated_length": 421.0,
|
|
"completions/mean_length": 260.109375,
|
|
"completions/mean_terminated_length": 260.109375,
|
|
"completions/min_length": 150.0,
|
|
"completions/min_terminated_length": 150.0,
|
|
"epoch": 0.336,
|
|
"grad_norm": 0.027533669024705887,
|
|
"learning_rate": 7.384341637010676e-07,
|
|
"loss": -0.0076,
|
|
"num_tokens": 102791329.0,
|
|
"reward": 1.4349637031555176,
|
|
"reward_std": 0.23317797482013702,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.7997035384178162,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8151513338088989,
|
|
"step": 210
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 522.0,
|
|
"completions/max_terminated_length": 522.0,
|
|
"completions/mean_length": 262.828125,
|
|
"completions/mean_terminated_length": 262.828125,
|
|
"completions/min_length": 144.0,
|
|
"completions/min_terminated_length": 144.0,
|
|
"epoch": 0.3376,
|
|
"grad_norm": 0.028309568762779236,
|
|
"learning_rate": 7.366548042704625e-07,
|
|
"loss": 0.0238,
|
|
"num_tokens": 103289013.0,
|
|
"reward": 1.3645201921463013,
|
|
"reward_std": 0.20173460245132446,
|
|
"rewards/accuracy_reward_long_step": 0.4609375,
|
|
"rewards/final_brier_reward_long_step": 0.8263086080551147,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.788021981716156,
|
|
"step": 211
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 422.0,
|
|
"completions/max_terminated_length": 422.0,
|
|
"completions/mean_length": 257.796875,
|
|
"completions/mean_terminated_length": 257.796875,
|
|
"completions/min_length": 137.0,
|
|
"completions/min_terminated_length": 137.0,
|
|
"epoch": 0.3392,
|
|
"grad_norm": 0.02732851170003414,
|
|
"learning_rate": 7.348754448398576e-07,
|
|
"loss": -0.005,
|
|
"num_tokens": 103776305.0,
|
|
"reward": 1.3025028705596924,
|
|
"reward_std": 0.19672125577926636,
|
|
"rewards/accuracy_reward_long_step": 0.41796875,
|
|
"rewards/final_brier_reward_long_step": 0.7558465003967285,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7822898626327515,
|
|
"step": 212
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 427.0,
|
|
"completions/max_terminated_length": 427.0,
|
|
"completions/mean_length": 252.21484375,
|
|
"completions/mean_terminated_length": 252.21484375,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.3408,
|
|
"grad_norm": 0.02799782156944275,
|
|
"learning_rate": 7.330960854092527e-07,
|
|
"loss": -0.0066,
|
|
"num_tokens": 104269640.0,
|
|
"reward": 1.3634313344955444,
|
|
"reward_std": 0.23930571973323822,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.72954922914505,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.810113787651062,
|
|
"step": 213
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 478.0,
|
|
"completions/max_terminated_length": 478.0,
|
|
"completions/mean_length": 251.56640625,
|
|
"completions/mean_terminated_length": 251.56640625,
|
|
"completions/min_length": 156.0,
|
|
"completions/min_terminated_length": 156.0,
|
|
"epoch": 0.3424,
|
|
"grad_norm": 0.029079895466566086,
|
|
"learning_rate": 7.313167259786477e-07,
|
|
"loss": -0.0028,
|
|
"num_tokens": 104767105.0,
|
|
"reward": 1.3156641721725464,
|
|
"reward_std": 0.15963619947433472,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7907624840736389,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8156442642211914,
|
|
"step": 214
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 440.0,
|
|
"completions/max_terminated_length": 440.0,
|
|
"completions/mean_length": 248.34375,
|
|
"completions/mean_terminated_length": 248.34375,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.344,
|
|
"grad_norm": 0.029702944681048393,
|
|
"learning_rate": 7.295373665480427e-07,
|
|
"loss": 0.0191,
|
|
"num_tokens": 105264905.0,
|
|
"reward": 1.2511861324310303,
|
|
"reward_std": 0.13276691734790802,
|
|
"rewards/accuracy_reward_long_step": 0.375,
|
|
"rewards/final_brier_reward_long_step": 0.7566503286361694,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7559068202972412,
|
|
"step": 215
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 382.0,
|
|
"completions/max_terminated_length": 382.0,
|
|
"completions/mean_length": 240.96875,
|
|
"completions/mean_terminated_length": 240.96875,
|
|
"completions/min_length": 129.0,
|
|
"completions/min_terminated_length": 129.0,
|
|
"epoch": 0.3456,
|
|
"grad_norm": 0.02834567055106163,
|
|
"learning_rate": 7.277580071174377e-07,
|
|
"loss": -0.0024,
|
|
"num_tokens": 105756457.0,
|
|
"reward": 1.4394803047180176,
|
|
"reward_std": 0.2180296927690506,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.7011808753013611,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8067399859428406,
|
|
"step": 216
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 489.0,
|
|
"completions/max_terminated_length": 489.0,
|
|
"completions/mean_length": 247.52734375,
|
|
"completions/mean_terminated_length": 247.52734375,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.3472,
|
|
"grad_norm": 0.027301594614982605,
|
|
"learning_rate": 7.259786476868327e-07,
|
|
"loss": 0.0056,
|
|
"num_tokens": 106254728.0,
|
|
"reward": 1.3793138265609741,
|
|
"reward_std": 0.16131410002708435,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.7766991853713989,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.818681001663208,
|
|
"step": 217
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 444.0,
|
|
"completions/max_terminated_length": 444.0,
|
|
"completions/mean_length": 246.4140625,
|
|
"completions/mean_terminated_length": 246.4140625,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"epoch": 0.3488,
|
|
"grad_norm": 0.02674778178334236,
|
|
"learning_rate": 7.241992882562277e-07,
|
|
"loss": -0.0079,
|
|
"num_tokens": 106746986.0,
|
|
"reward": 1.3636713027954102,
|
|
"reward_std": 0.15767797827720642,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.743729293346405,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7734558582305908,
|
|
"step": 218
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 461.0,
|
|
"completions/max_terminated_length": 461.0,
|
|
"completions/mean_length": 250.6796875,
|
|
"completions/mean_terminated_length": 250.6796875,
|
|
"completions/min_length": 159.0,
|
|
"completions/min_terminated_length": 159.0,
|
|
"epoch": 0.3504,
|
|
"grad_norm": 0.028994156047701836,
|
|
"learning_rate": 7.224199288256227e-07,
|
|
"loss": -0.015,
|
|
"num_tokens": 107246088.0,
|
|
"reward": 1.3627524375915527,
|
|
"reward_std": 0.21599024534225464,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.7811144590377808,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8027076721191406,
|
|
"step": 219
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 400.0,
|
|
"completions/max_terminated_length": 400.0,
|
|
"completions/mean_length": 238.0390625,
|
|
"completions/mean_terminated_length": 238.0390625,
|
|
"completions/min_length": 156.0,
|
|
"completions/min_terminated_length": 156.0,
|
|
"epoch": 0.352,
|
|
"grad_norm": 0.03368399292230606,
|
|
"learning_rate": 7.206405693950178e-07,
|
|
"loss": -0.0043,
|
|
"num_tokens": 107737026.0,
|
|
"reward": 1.3865300416946411,
|
|
"reward_std": 0.20244070887565613,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.7735304236412048,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8350895047187805,
|
|
"step": 220
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 420.0,
|
|
"completions/max_terminated_length": 420.0,
|
|
"completions/mean_length": 234.5234375,
|
|
"completions/mean_terminated_length": 234.5234375,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.3536,
|
|
"grad_norm": 0.030342837795615196,
|
|
"learning_rate": 7.188612099644128e-07,
|
|
"loss": 0.0076,
|
|
"num_tokens": 108207696.0,
|
|
"reward": 1.3147838115692139,
|
|
"reward_std": 0.14057135581970215,
|
|
"rewards/accuracy_reward_long_step": 0.4453125,
|
|
"rewards/final_brier_reward_long_step": 0.7470492124557495,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7386487722396851,
|
|
"step": 221
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 432.0,
|
|
"completions/max_terminated_length": 432.0,
|
|
"completions/mean_length": 243.98046875,
|
|
"completions/mean_terminated_length": 243.98046875,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.3552,
|
|
"grad_norm": 0.028509140014648438,
|
|
"learning_rate": 7.170818505338078e-07,
|
|
"loss": 0.0073,
|
|
"num_tokens": 108685563.0,
|
|
"reward": 1.4801634550094604,
|
|
"reward_std": 0.2189689725637436,
|
|
"rewards/accuracy_reward_long_step": 0.57421875,
|
|
"rewards/final_brier_reward_long_step": 0.7850944995880127,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8386842608451843,
|
|
"step": 222
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 393.0,
|
|
"completions/max_terminated_length": 393.0,
|
|
"completions/mean_length": 234.9921875,
|
|
"completions/mean_terminated_length": 234.9921875,
|
|
"completions/min_length": 115.0,
|
|
"completions/min_terminated_length": 115.0,
|
|
"epoch": 0.3568,
|
|
"grad_norm": 0.02966553531587124,
|
|
"learning_rate": 7.153024911032028e-07,
|
|
"loss": 0.0007,
|
|
"num_tokens": 109167289.0,
|
|
"reward": 1.377956509590149,
|
|
"reward_std": 0.15985409915447235,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.7817855477333069,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8081655502319336,
|
|
"step": 223
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 383.0,
|
|
"completions/max_terminated_length": 383.0,
|
|
"completions/mean_length": 231.79296875,
|
|
"completions/mean_terminated_length": 231.79296875,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.3584,
|
|
"grad_norm": 0.030611420050263405,
|
|
"learning_rate": 7.135231316725978e-07,
|
|
"loss": 0.0122,
|
|
"num_tokens": 109655324.0,
|
|
"reward": 1.3960628509521484,
|
|
"reward_std": 0.11526073515415192,
|
|
"rewards/accuracy_reward_long_step": 0.4609375,
|
|
"rewards/final_brier_reward_long_step": 0.865734338760376,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8747667074203491,
|
|
"step": 224
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 424.0,
|
|
"completions/max_terminated_length": 424.0,
|
|
"completions/mean_length": 239.73828125,
|
|
"completions/mean_terminated_length": 239.73828125,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.36,
|
|
"grad_norm": 0.032648514956235886,
|
|
"learning_rate": 7.117437722419929e-07,
|
|
"loss": 0.0134,
|
|
"num_tokens": 110142809.0,
|
|
"reward": 1.300749659538269,
|
|
"reward_std": 0.20763415098190308,
|
|
"rewards/accuracy_reward_long_step": 0.41796875,
|
|
"rewards/final_brier_reward_long_step": 0.7577574253082275,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7733659744262695,
|
|
"step": 225
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 480.0,
|
|
"completions/max_terminated_length": 480.0,
|
|
"completions/mean_length": 251.1875,
|
|
"completions/mean_terminated_length": 251.1875,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.3616,
|
|
"grad_norm": 0.027327539399266243,
|
|
"learning_rate": 7.099644128113878e-07,
|
|
"loss": 0.0023,
|
|
"num_tokens": 110646393.0,
|
|
"reward": 1.5032904148101807,
|
|
"reward_std": 0.1392737776041031,
|
|
"rewards/accuracy_reward_long_step": 0.5859375,
|
|
"rewards/final_brier_reward_long_step": 0.8319687843322754,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8374428749084473,
|
|
"step": 226
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 406.0,
|
|
"completions/max_terminated_length": 406.0,
|
|
"completions/mean_length": 248.3046875,
|
|
"completions/mean_terminated_length": 248.3046875,
|
|
"completions/min_length": 112.0,
|
|
"completions/min_terminated_length": 112.0,
|
|
"epoch": 0.3632,
|
|
"grad_norm": 0.028683941811323166,
|
|
"learning_rate": 7.08185053380783e-07,
|
|
"loss": -0.0104,
|
|
"num_tokens": 111139431.0,
|
|
"reward": 1.2084152698516846,
|
|
"reward_std": 0.1405543088912964,
|
|
"rewards/accuracy_reward_long_step": 0.30859375,
|
|
"rewards/final_brier_reward_long_step": 0.7831394672393799,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8161464929580688,
|
|
"step": 227
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 369.0,
|
|
"completions/max_terminated_length": 369.0,
|
|
"completions/mean_length": 248.3671875,
|
|
"completions/mean_terminated_length": 248.3671875,
|
|
"completions/min_length": 142.0,
|
|
"completions/min_terminated_length": 142.0,
|
|
"epoch": 0.3648,
|
|
"grad_norm": 0.02712222747504711,
|
|
"learning_rate": 7.064056939501779e-07,
|
|
"loss": -0.006,
|
|
"num_tokens": 111635245.0,
|
|
"reward": 1.341683030128479,
|
|
"reward_std": 0.18412762880325317,
|
|
"rewards/accuracy_reward_long_step": 0.44140625,
|
|
"rewards/final_brier_reward_long_step": 0.7527234554290771,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8483837842941284,
|
|
"step": 228
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 457.0,
|
|
"completions/max_terminated_length": 457.0,
|
|
"completions/mean_length": 242.3046875,
|
|
"completions/mean_terminated_length": 242.3046875,
|
|
"completions/min_length": 144.0,
|
|
"completions/min_terminated_length": 144.0,
|
|
"epoch": 0.3664,
|
|
"grad_norm": 0.028868133202195168,
|
|
"learning_rate": 7.046263345195729e-07,
|
|
"loss": 0.0127,
|
|
"num_tokens": 112127091.0,
|
|
"reward": 1.4517192840576172,
|
|
"reward_std": 0.16360458731651306,
|
|
"rewards/accuracy_reward_long_step": 0.578125,
|
|
"rewards/final_brier_reward_long_step": 0.7802172303199768,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7219727039337158,
|
|
"step": 229
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 444.0,
|
|
"completions/max_terminated_length": 444.0,
|
|
"completions/mean_length": 249.1953125,
|
|
"completions/mean_terminated_length": 249.1953125,
|
|
"completions/min_length": 150.0,
|
|
"completions/min_terminated_length": 150.0,
|
|
"epoch": 0.368,
|
|
"grad_norm": 0.030503099784255028,
|
|
"learning_rate": 7.028469750889679e-07,
|
|
"loss": 0.0012,
|
|
"num_tokens": 112615557.0,
|
|
"reward": 1.2901397943496704,
|
|
"reward_std": 0.1938847005367279,
|
|
"rewards/accuracy_reward_long_step": 0.43359375,
|
|
"rewards/final_brier_reward_long_step": 0.6676468849182129,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7585374116897583,
|
|
"step": 230
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 422.0,
|
|
"completions/max_terminated_length": 422.0,
|
|
"completions/mean_length": 236.74609375,
|
|
"completions/mean_terminated_length": 236.74609375,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.3696,
|
|
"grad_norm": 0.031112175434827805,
|
|
"learning_rate": 7.010676156583629e-07,
|
|
"loss": 0.0072,
|
|
"num_tokens": 113107308.0,
|
|
"reward": 1.3856728076934814,
|
|
"reward_std": 0.1376914530992508,
|
|
"rewards/accuracy_reward_long_step": 0.546875,
|
|
"rewards/final_brier_reward_long_step": 0.6620769500732422,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.693114161491394,
|
|
"step": 231
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 427.0,
|
|
"completions/max_terminated_length": 427.0,
|
|
"completions/mean_length": 235.6640625,
|
|
"completions/mean_terminated_length": 235.6640625,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"epoch": 0.3712,
|
|
"grad_norm": 0.028059890493750572,
|
|
"learning_rate": 6.99288256227758e-07,
|
|
"loss": -0.0005,
|
|
"num_tokens": 113587182.0,
|
|
"reward": 1.4213546514511108,
|
|
"reward_std": 0.2364693284034729,
|
|
"rewards/accuracy_reward_long_step": 0.5546875,
|
|
"rewards/final_brier_reward_long_step": 0.7457855343818665,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7208831310272217,
|
|
"step": 232
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 428.0,
|
|
"completions/max_terminated_length": 428.0,
|
|
"completions/mean_length": 235.29296875,
|
|
"completions/mean_terminated_length": 235.29296875,
|
|
"completions/min_length": 68.0,
|
|
"completions/min_terminated_length": 68.0,
|
|
"epoch": 0.3728,
|
|
"grad_norm": 0.027861230075359344,
|
|
"learning_rate": 6.975088967971529e-07,
|
|
"loss": 0.0032,
|
|
"num_tokens": 114059153.0,
|
|
"reward": 1.3009544610977173,
|
|
"reward_std": 0.12013030052185059,
|
|
"rewards/accuracy_reward_long_step": 0.38671875,
|
|
"rewards/final_brier_reward_long_step": 0.8454800844192505,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8192753195762634,
|
|
"step": 233
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 429.0,
|
|
"completions/max_terminated_length": 429.0,
|
|
"completions/mean_length": 238.9921875,
|
|
"completions/mean_terminated_length": 238.9921875,
|
|
"completions/min_length": 152.0,
|
|
"completions/min_terminated_length": 152.0,
|
|
"epoch": 0.3744,
|
|
"grad_norm": 0.02792465314269066,
|
|
"learning_rate": 6.957295373665481e-07,
|
|
"loss": -0.008,
|
|
"num_tokens": 114529591.0,
|
|
"reward": 1.4602546691894531,
|
|
"reward_std": 0.13010551035404205,
|
|
"rewards/accuracy_reward_long_step": 0.57421875,
|
|
"rewards/final_brier_reward_long_step": 0.7742418050765991,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7699018716812134,
|
|
"step": 234
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 378.0,
|
|
"completions/max_terminated_length": 378.0,
|
|
"completions/mean_length": 234.44140625,
|
|
"completions/mean_terminated_length": 234.44140625,
|
|
"completions/min_length": 144.0,
|
|
"completions/min_terminated_length": 144.0,
|
|
"epoch": 0.376,
|
|
"grad_norm": 0.029247693717479706,
|
|
"learning_rate": 6.93950177935943e-07,
|
|
"loss": 0.0089,
|
|
"num_tokens": 115003176.0,
|
|
"reward": 1.4750906229019165,
|
|
"reward_std": 0.14190274477005005,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.8098050951957703,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8405575752258301,
|
|
"step": 235
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 391.0,
|
|
"completions/max_terminated_length": 391.0,
|
|
"completions/mean_length": 234.3203125,
|
|
"completions/mean_terminated_length": 234.3203125,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.3776,
|
|
"grad_norm": 0.04604804888367653,
|
|
"learning_rate": 6.921708185053381e-07,
|
|
"loss": -0.0091,
|
|
"num_tokens": 115499370.0,
|
|
"reward": 1.253650426864624,
|
|
"reward_std": 0.1829456090927124,
|
|
"rewards/accuracy_reward_long_step": 0.36328125,
|
|
"rewards/final_brier_reward_long_step": 0.8057793378829956,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7556971311569214,
|
|
"step": 236
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 449.0,
|
|
"completions/max_terminated_length": 449.0,
|
|
"completions/mean_length": 244.71875,
|
|
"completions/mean_terminated_length": 244.71875,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.3792,
|
|
"grad_norm": 0.031333666294813156,
|
|
"learning_rate": 6.903914590747331e-07,
|
|
"loss": 0.0144,
|
|
"num_tokens": 115977570.0,
|
|
"reward": 1.4031198024749756,
|
|
"reward_std": 0.1869141310453415,
|
|
"rewards/accuracy_reward_long_step": 0.50390625,
|
|
"rewards/final_brier_reward_long_step": 0.771310567855835,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8255437612533569,
|
|
"step": 237
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 420.0,
|
|
"completions/max_terminated_length": 420.0,
|
|
"completions/mean_length": 249.234375,
|
|
"completions/mean_terminated_length": 249.234375,
|
|
"completions/min_length": 95.0,
|
|
"completions/min_terminated_length": 95.0,
|
|
"epoch": 0.3808,
|
|
"grad_norm": 0.028639167547225952,
|
|
"learning_rate": 6.88612099644128e-07,
|
|
"loss": 0.0002,
|
|
"num_tokens": 116465086.0,
|
|
"reward": 1.3048759698867798,
|
|
"reward_std": 0.18162578344345093,
|
|
"rewards/accuracy_reward_long_step": 0.43359375,
|
|
"rewards/final_brier_reward_long_step": 0.7125644683837891,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7725646495819092,
|
|
"step": 238
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 452.0,
|
|
"completions/max_terminated_length": 452.0,
|
|
"completions/mean_length": 246.1953125,
|
|
"completions/mean_terminated_length": 246.1953125,
|
|
"completions/min_length": 144.0,
|
|
"completions/min_terminated_length": 144.0,
|
|
"epoch": 0.3824,
|
|
"grad_norm": 0.029338406398892403,
|
|
"learning_rate": 6.868327402135231e-07,
|
|
"loss": 0.0053,
|
|
"num_tokens": 116931320.0,
|
|
"reward": 1.4342849254608154,
|
|
"reward_std": 0.1952168196439743,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.8456206917762756,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.79776930809021,
|
|
"step": 239
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 494.0,
|
|
"completions/max_terminated_length": 494.0,
|
|
"completions/mean_length": 244.49609375,
|
|
"completions/mean_terminated_length": 244.49609375,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.384,
|
|
"grad_norm": 0.02827758900821209,
|
|
"learning_rate": 6.85053380782918e-07,
|
|
"loss": -0.0059,
|
|
"num_tokens": 117435631.0,
|
|
"reward": 1.448837399482727,
|
|
"reward_std": 0.13833093643188477,
|
|
"rewards/accuracy_reward_long_step": 0.55078125,
|
|
"rewards/final_brier_reward_long_step": 0.8106515407562256,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7815728187561035,
|
|
"step": 240
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 408.0,
|
|
"completions/max_terminated_length": 408.0,
|
|
"completions/mean_length": 245.53515625,
|
|
"completions/mean_terminated_length": 245.53515625,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.3856,
|
|
"grad_norm": 0.04907617345452309,
|
|
"learning_rate": 6.832740213523132e-07,
|
|
"loss": 0.0111,
|
|
"num_tokens": 117927904.0,
|
|
"reward": 1.2637048959732056,
|
|
"reward_std": 0.19578373432159424,
|
|
"rewards/accuracy_reward_long_step": 0.375,
|
|
"rewards/final_brier_reward_long_step": 0.7642871141433716,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7905321717262268,
|
|
"step": 241
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 395.0,
|
|
"completions/max_terminated_length": 395.0,
|
|
"completions/mean_length": 247.26953125,
|
|
"completions/mean_terminated_length": 247.26953125,
|
|
"completions/min_length": 112.0,
|
|
"completions/min_terminated_length": 112.0,
|
|
"epoch": 0.3872,
|
|
"grad_norm": 0.02862401306629181,
|
|
"learning_rate": 6.814946619217081e-07,
|
|
"loss": 0.0034,
|
|
"num_tokens": 118409677.0,
|
|
"reward": 1.4095513820648193,
|
|
"reward_std": 0.16092461347579956,
|
|
"rewards/accuracy_reward_long_step": 0.5078125,
|
|
"rewards/final_brier_reward_long_step": 0.7932863235473633,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8136688470840454,
|
|
"step": 242
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 401.0,
|
|
"completions/max_terminated_length": 401.0,
|
|
"completions/mean_length": 245.3828125,
|
|
"completions/mean_terminated_length": 245.3828125,
|
|
"completions/min_length": 143.0,
|
|
"completions/min_terminated_length": 143.0,
|
|
"epoch": 0.3888,
|
|
"grad_norm": 0.02867737039923668,
|
|
"learning_rate": 6.797153024911032e-07,
|
|
"loss": -0.0028,
|
|
"num_tokens": 118886583.0,
|
|
"reward": 1.4647985696792603,
|
|
"reward_std": 0.22731342911720276,
|
|
"rewards/accuracy_reward_long_step": 0.58984375,
|
|
"rewards/final_brier_reward_long_step": 0.7210452556610107,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7787743210792542,
|
|
"step": 243
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 423.0,
|
|
"completions/max_terminated_length": 423.0,
|
|
"completions/mean_length": 241.7578125,
|
|
"completions/mean_terminated_length": 241.7578125,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.3904,
|
|
"grad_norm": 0.029156368225812912,
|
|
"learning_rate": 6.779359430604982e-07,
|
|
"loss": -0.005,
|
|
"num_tokens": 119362457.0,
|
|
"reward": 1.4701181650161743,
|
|
"reward_std": 0.18407407402992249,
|
|
"rewards/accuracy_reward_long_step": 0.58984375,
|
|
"rewards/final_brier_reward_long_step": 0.7508969306945801,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7702009081840515,
|
|
"step": 244
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 452.0,
|
|
"completions/max_terminated_length": 452.0,
|
|
"completions/mean_length": 255.76171875,
|
|
"completions/mean_terminated_length": 255.76171875,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.392,
|
|
"grad_norm": 0.02922751009464264,
|
|
"learning_rate": 6.761565836298932e-07,
|
|
"loss": -0.0079,
|
|
"num_tokens": 119859300.0,
|
|
"reward": 1.2958691120147705,
|
|
"reward_std": 0.19874346256256104,
|
|
"rewards/accuracy_reward_long_step": 0.390625,
|
|
"rewards/final_brier_reward_long_step": 0.7741097807884216,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.84686678647995,
|
|
"step": 245
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 452.0,
|
|
"completions/max_terminated_length": 452.0,
|
|
"completions/mean_length": 253.25390625,
|
|
"completions/mean_terminated_length": 253.25390625,
|
|
"completions/min_length": 154.0,
|
|
"completions/min_terminated_length": 154.0,
|
|
"epoch": 0.3936,
|
|
"grad_norm": 0.029055269435048103,
|
|
"learning_rate": 6.743772241992882e-07,
|
|
"loss": 0.0013,
|
|
"num_tokens": 120346845.0,
|
|
"reward": 1.3145337104797363,
|
|
"reward_std": 0.1577182412147522,
|
|
"rewards/accuracy_reward_long_step": 0.42578125,
|
|
"rewards/final_brier_reward_long_step": 0.7263835668563843,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.828626275062561,
|
|
"step": 246
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 455.0,
|
|
"completions/max_terminated_length": 455.0,
|
|
"completions/mean_length": 247.15625,
|
|
"completions/mean_terminated_length": 247.15625,
|
|
"completions/min_length": 118.0,
|
|
"completions/min_terminated_length": 118.0,
|
|
"epoch": 0.3952,
|
|
"grad_norm": 0.030021749436855316,
|
|
"learning_rate": 6.725978647686833e-07,
|
|
"loss": -0.0072,
|
|
"num_tokens": 120849013.0,
|
|
"reward": 1.3015249967575073,
|
|
"reward_std": 0.15772968530654907,
|
|
"rewards/accuracy_reward_long_step": 0.41015625,
|
|
"rewards/final_brier_reward_long_step": 0.793542206287384,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7797452211380005,
|
|
"step": 247
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 461.0,
|
|
"completions/max_terminated_length": 461.0,
|
|
"completions/mean_length": 246.453125,
|
|
"completions/mean_terminated_length": 246.453125,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.3968,
|
|
"grad_norm": 0.03100651502609253,
|
|
"learning_rate": 6.708185053380783e-07,
|
|
"loss": -0.0017,
|
|
"num_tokens": 121339545.0,
|
|
"reward": 1.3638386726379395,
|
|
"reward_std": 0.18921023607254028,
|
|
"rewards/accuracy_reward_long_step": 0.46484375,
|
|
"rewards/final_brier_reward_long_step": 0.7788769602775574,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8171026110649109,
|
|
"step": 248
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 459.0,
|
|
"completions/max_terminated_length": 459.0,
|
|
"completions/mean_length": 252.19140625,
|
|
"completions/mean_terminated_length": 252.19140625,
|
|
"completions/min_length": 118.0,
|
|
"completions/min_terminated_length": 118.0,
|
|
"epoch": 0.3984,
|
|
"grad_norm": 0.029122378677129745,
|
|
"learning_rate": 6.690391459074733e-07,
|
|
"loss": 0.0026,
|
|
"num_tokens": 121826858.0,
|
|
"reward": 1.3321928977966309,
|
|
"reward_std": 0.18050938844680786,
|
|
"rewards/accuracy_reward_long_step": 0.4296875,
|
|
"rewards/final_brier_reward_long_step": 0.7863625288009644,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.82365882396698,
|
|
"step": 249
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 401.0,
|
|
"completions/max_terminated_length": 401.0,
|
|
"completions/mean_length": 255.10546875,
|
|
"completions/mean_terminated_length": 255.10546875,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.4,
|
|
"grad_norm": 0.02999301441013813,
|
|
"learning_rate": 6.672597864768683e-07,
|
|
"loss": -0.0024,
|
|
"num_tokens": 122323669.0,
|
|
"reward": 1.5447840690612793,
|
|
"reward_std": 0.1723225712776184,
|
|
"rewards/accuracy_reward_long_step": 0.66015625,
|
|
"rewards/final_brier_reward_long_step": 0.7203683853149414,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8181428909301758,
|
|
"step": 250
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 440.0,
|
|
"completions/max_terminated_length": 440.0,
|
|
"completions/mean_length": 263.68359375,
|
|
"completions/mean_terminated_length": 263.68359375,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.4016,
|
|
"grad_norm": 0.03101922571659088,
|
|
"learning_rate": 6.654804270462633e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 122815420.0,
|
|
"reward": 1.2982118129730225,
|
|
"reward_std": 0.22702577710151672,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7307562828063965,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8058412075042725,
|
|
"step": 251
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 432.0,
|
|
"completions/max_terminated_length": 432.0,
|
|
"completions/mean_length": 269.63671875,
|
|
"completions/mean_terminated_length": 269.63671875,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"epoch": 0.4032,
|
|
"grad_norm": 0.028656797483563423,
|
|
"learning_rate": 6.637010676156583e-07,
|
|
"loss": -0.0024,
|
|
"num_tokens": 123311063.0,
|
|
"reward": 1.2810194492340088,
|
|
"reward_std": 0.16011501848697662,
|
|
"rewards/accuracy_reward_long_step": 0.37109375,
|
|
"rewards/final_brier_reward_long_step": 0.7987828254699707,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8409198522567749,
|
|
"step": 252
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 425.0,
|
|
"completions/max_terminated_length": 425.0,
|
|
"completions/mean_length": 253.25,
|
|
"completions/mean_terminated_length": 253.25,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.4048,
|
|
"grad_norm": 0.030914753675460815,
|
|
"learning_rate": 6.619217081850533e-07,
|
|
"loss": 0.0152,
|
|
"num_tokens": 123774055.0,
|
|
"reward": 1.3488011360168457,
|
|
"reward_std": 0.12081344425678253,
|
|
"rewards/accuracy_reward_long_step": 0.44140625,
|
|
"rewards/final_brier_reward_long_step": 0.810867190361023,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8187124729156494,
|
|
"step": 253
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 429.0,
|
|
"completions/max_terminated_length": 429.0,
|
|
"completions/mean_length": 264.98046875,
|
|
"completions/mean_terminated_length": 264.98046875,
|
|
"completions/min_length": 120.0,
|
|
"completions/min_terminated_length": 120.0,
|
|
"epoch": 0.4064,
|
|
"grad_norm": 0.028002172708511353,
|
|
"learning_rate": 6.601423487544484e-07,
|
|
"loss": -0.0013,
|
|
"num_tokens": 124260066.0,
|
|
"reward": 1.4030770063400269,
|
|
"reward_std": 0.14950624108314514,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.7564605474472046,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7308475971221924,
|
|
"step": 254
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 550.0,
|
|
"completions/max_terminated_length": 550.0,
|
|
"completions/mean_length": 267.8203125,
|
|
"completions/mean_terminated_length": 267.8203125,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.408,
|
|
"grad_norm": 0.027425022795796394,
|
|
"learning_rate": 6.583629893238434e-07,
|
|
"loss": -0.0113,
|
|
"num_tokens": 124759276.0,
|
|
"reward": 1.3036949634552002,
|
|
"reward_std": 0.22329337894916534,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7808917760848999,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7776377201080322,
|
|
"step": 255
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 430.0,
|
|
"completions/max_terminated_length": 430.0,
|
|
"completions/mean_length": 256.765625,
|
|
"completions/mean_terminated_length": 256.765625,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.4096,
|
|
"grad_norm": 0.02852526493370533,
|
|
"learning_rate": 6.565836298932385e-07,
|
|
"loss": -0.0086,
|
|
"num_tokens": 125243288.0,
|
|
"reward": 1.462049961090088,
|
|
"reward_std": 0.17608040571212769,
|
|
"rewards/accuracy_reward_long_step": 0.6328125,
|
|
"rewards/final_brier_reward_long_step": 0.6412858963012695,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6756638288497925,
|
|
"step": 256
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 489.0,
|
|
"completions/max_terminated_length": 489.0,
|
|
"completions/mean_length": 267.80078125,
|
|
"completions/mean_terminated_length": 267.80078125,
|
|
"completions/min_length": 107.0,
|
|
"completions/min_terminated_length": 107.0,
|
|
"epoch": 0.4112,
|
|
"grad_norm": 0.030284898355603218,
|
|
"learning_rate": 6.548042704626334e-07,
|
|
"loss": -0.0072,
|
|
"num_tokens": 125741869.0,
|
|
"reward": 1.3154691457748413,
|
|
"reward_std": 0.2600463032722473,
|
|
"rewards/accuracy_reward_long_step": 0.44140625,
|
|
"rewards/final_brier_reward_long_step": 0.7620574235916138,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7341942191123962,
|
|
"step": 257
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 418.0,
|
|
"completions/max_terminated_length": 418.0,
|
|
"completions/mean_length": 262.48828125,
|
|
"completions/mean_terminated_length": 262.48828125,
|
|
"completions/min_length": 107.0,
|
|
"completions/min_terminated_length": 107.0,
|
|
"epoch": 0.4128,
|
|
"grad_norm": 0.03752259910106659,
|
|
"learning_rate": 6.530249110320284e-07,
|
|
"loss": -0.0087,
|
|
"num_tokens": 126234514.0,
|
|
"reward": 1.4143836498260498,
|
|
"reward_std": 0.14952951669692993,
|
|
"rewards/accuracy_reward_long_step": 0.51171875,
|
|
"rewards/final_brier_reward_long_step": 0.7889230251312256,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8217366933822632,
|
|
"step": 258
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 437.0,
|
|
"completions/max_terminated_length": 437.0,
|
|
"completions/mean_length": 274.88671875,
|
|
"completions/mean_terminated_length": 274.88671875,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.4144,
|
|
"grad_norm": 0.028582880273461342,
|
|
"learning_rate": 6.512455516014234e-07,
|
|
"loss": -0.0036,
|
|
"num_tokens": 126730069.0,
|
|
"reward": 1.4206815958023071,
|
|
"reward_std": 0.19087818264961243,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.7297155857086182,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8045732975006104,
|
|
"step": 259
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 478.0,
|
|
"completions/max_terminated_length": 478.0,
|
|
"completions/mean_length": 272.7578125,
|
|
"completions/mean_terminated_length": 272.7578125,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.416,
|
|
"grad_norm": 0.026567399501800537,
|
|
"learning_rate": 6.494661921708184e-07,
|
|
"loss": -0.0099,
|
|
"num_tokens": 127214927.0,
|
|
"reward": 1.3398463726043701,
|
|
"reward_std": 0.12067516893148422,
|
|
"rewards/accuracy_reward_long_step": 0.44921875,
|
|
"rewards/final_brier_reward_long_step": 0.860762894153595,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7017475366592407,
|
|
"step": 260
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 553.0,
|
|
"completions/max_terminated_length": 553.0,
|
|
"completions/mean_length": 277.7265625,
|
|
"completions/mean_terminated_length": 277.7265625,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.4176,
|
|
"grad_norm": 0.02695128507912159,
|
|
"learning_rate": 6.476868327402136e-07,
|
|
"loss": -0.001,
|
|
"num_tokens": 127710617.0,
|
|
"reward": 1.4976187944412231,
|
|
"reward_std": 0.14346104860305786,
|
|
"rewards/accuracy_reward_long_step": 0.60546875,
|
|
"rewards/final_brier_reward_long_step": 0.7943031191825867,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7742971181869507,
|
|
"step": 261
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 461.0,
|
|
"completions/max_terminated_length": 461.0,
|
|
"completions/mean_length": 273.38671875,
|
|
"completions/mean_terminated_length": 273.38671875,
|
|
"completions/min_length": 114.0,
|
|
"completions/min_terminated_length": 114.0,
|
|
"epoch": 0.4192,
|
|
"grad_norm": 0.02583600953221321,
|
|
"learning_rate": 6.459074733096085e-07,
|
|
"loss": -0.0104,
|
|
"num_tokens": 128212556.0,
|
|
"reward": 1.436043620109558,
|
|
"reward_std": 0.11342111974954605,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.8090195655822754,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.778904914855957,
|
|
"step": 262
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 486.0,
|
|
"completions/max_terminated_length": 486.0,
|
|
"completions/mean_length": 275.109375,
|
|
"completions/mean_terminated_length": 275.109375,
|
|
"completions/min_length": 114.0,
|
|
"completions/min_terminated_length": 114.0,
|
|
"epoch": 0.4208,
|
|
"grad_norm": 0.027538571506738663,
|
|
"learning_rate": 6.441281138790036e-07,
|
|
"loss": -0.0113,
|
|
"num_tokens": 128707720.0,
|
|
"reward": 1.2951858043670654,
|
|
"reward_std": 0.15389752388000488,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7856941223144531,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.738798975944519,
|
|
"step": 263
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 587.0,
|
|
"completions/max_terminated_length": 587.0,
|
|
"completions/mean_length": 292.70703125,
|
|
"completions/mean_terminated_length": 292.70703125,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.4224,
|
|
"grad_norm": 0.0262598879635334,
|
|
"learning_rate": 6.423487544483985e-07,
|
|
"loss": 0.017,
|
|
"num_tokens": 129207389.0,
|
|
"reward": 1.3416748046875,
|
|
"reward_std": 0.23134978115558624,
|
|
"rewards/accuracy_reward_long_step": 0.4375,
|
|
"rewards/final_brier_reward_long_step": 0.7901312112808228,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8343803882598877,
|
|
"step": 264
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 561.0,
|
|
"completions/max_terminated_length": 561.0,
|
|
"completions/mean_length": 288.3515625,
|
|
"completions/mean_terminated_length": 288.3515625,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.424,
|
|
"grad_norm": 0.030151214450597763,
|
|
"learning_rate": 6.405693950177936e-07,
|
|
"loss": 0.0096,
|
|
"num_tokens": 129708127.0,
|
|
"reward": 1.5116443634033203,
|
|
"reward_std": 0.15138523280620575,
|
|
"rewards/accuracy_reward_long_step": 0.61328125,
|
|
"rewards/final_brier_reward_long_step": 0.8109831809997559,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7824693918228149,
|
|
"step": 265
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 530.0,
|
|
"completions/max_terminated_length": 530.0,
|
|
"completions/mean_length": 285.359375,
|
|
"completions/mean_terminated_length": 285.359375,
|
|
"completions/min_length": 123.0,
|
|
"completions/min_terminated_length": 123.0,
|
|
"epoch": 0.4256,
|
|
"grad_norm": 0.030536562204360962,
|
|
"learning_rate": 6.387900355871885e-07,
|
|
"loss": 0.0139,
|
|
"num_tokens": 130207523.0,
|
|
"reward": 1.5717109441757202,
|
|
"reward_std": 0.1531601846218109,
|
|
"rewards/accuracy_reward_long_step": 0.671875,
|
|
"rewards/final_brier_reward_long_step": 0.7941410541534424,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.805202841758728,
|
|
"step": 266
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 512.0,
|
|
"completions/max_terminated_length": 512.0,
|
|
"completions/mean_length": 285.46875,
|
|
"completions/mean_terminated_length": 285.46875,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.4272,
|
|
"grad_norm": 0.028177211061120033,
|
|
"learning_rate": 6.370106761565835e-07,
|
|
"loss": 0.0141,
|
|
"num_tokens": 130672219.0,
|
|
"reward": 1.3481889963150024,
|
|
"reward_std": 0.1336720734834671,
|
|
"rewards/accuracy_reward_long_step": 0.4453125,
|
|
"rewards/final_brier_reward_long_step": 0.836502730846405,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7750029563903809,
|
|
"step": 267
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 454.0,
|
|
"completions/max_terminated_length": 454.0,
|
|
"completions/mean_length": 289.078125,
|
|
"completions/mean_terminated_length": 289.078125,
|
|
"completions/min_length": 125.0,
|
|
"completions/min_terminated_length": 125.0,
|
|
"epoch": 0.4288,
|
|
"grad_norm": 0.027222031727433205,
|
|
"learning_rate": 6.352313167259787e-07,
|
|
"loss": -0.0063,
|
|
"num_tokens": 131182759.0,
|
|
"reward": 1.3185360431671143,
|
|
"reward_std": 0.17626741528511047,
|
|
"rewards/accuracy_reward_long_step": 0.44140625,
|
|
"rewards/final_brier_reward_long_step": 0.7190519571304321,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7894670963287354,
|
|
"step": 268
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 533.0,
|
|
"completions/max_terminated_length": 533.0,
|
|
"completions/mean_length": 280.86328125,
|
|
"completions/mean_terminated_length": 280.86328125,
|
|
"completions/min_length": 134.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.4304,
|
|
"grad_norm": 0.03042900562286377,
|
|
"learning_rate": 6.334519572953736e-07,
|
|
"loss": -0.0005,
|
|
"num_tokens": 131680284.0,
|
|
"reward": 1.4278100728988647,
|
|
"reward_std": 0.10464347898960114,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.7685461044311523,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7708194255828857,
|
|
"step": 269
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 501.0,
|
|
"completions/max_terminated_length": 501.0,
|
|
"completions/mean_length": 287.1796875,
|
|
"completions/mean_terminated_length": 288.305908203125,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.432,
|
|
"grad_norm": 0.027735978364944458,
|
|
"learning_rate": 6.316725978647687e-07,
|
|
"loss": 0.0066,
|
|
"num_tokens": 132185850.0,
|
|
"reward": 1.4755172729492188,
|
|
"reward_std": 0.208018958568573,
|
|
"rewards/accuracy_reward_long_step": 0.59375,
|
|
"rewards/final_brier_reward_long_step": 0.7717519402503967,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7631298303604126,
|
|
"step": 270
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 598.0,
|
|
"completions/max_terminated_length": 598.0,
|
|
"completions/mean_length": 296.0703125,
|
|
"completions/mean_terminated_length": 296.0703125,
|
|
"completions/min_length": 144.0,
|
|
"completions/min_terminated_length": 144.0,
|
|
"epoch": 0.4336,
|
|
"grad_norm": 0.0286890659481287,
|
|
"learning_rate": 6.298932384341636e-07,
|
|
"loss": -0.0039,
|
|
"num_tokens": 132682476.0,
|
|
"reward": 1.224784255027771,
|
|
"reward_std": 0.1470840871334076,
|
|
"rewards/accuracy_reward_long_step": 0.36328125,
|
|
"rewards/final_brier_reward_long_step": 0.7217913866043091,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7242205142974854,
|
|
"step": 271
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 502.0,
|
|
"completions/max_terminated_length": 502.0,
|
|
"completions/mean_length": 290.5546875,
|
|
"completions/mean_terminated_length": 290.5546875,
|
|
"completions/min_length": 152.0,
|
|
"completions/min_terminated_length": 152.0,
|
|
"epoch": 0.4352,
|
|
"grad_norm": 0.028832513839006424,
|
|
"learning_rate": 6.281138790035587e-07,
|
|
"loss": 0.0023,
|
|
"num_tokens": 133175010.0,
|
|
"reward": 1.4400919675827026,
|
|
"reward_std": 0.13745911419391632,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.8417414426803589,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.746751606464386,
|
|
"step": 272
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 567.0,
|
|
"completions/max_terminated_length": 567.0,
|
|
"completions/mean_length": 294.5546875,
|
|
"completions/mean_terminated_length": 294.5546875,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.4368,
|
|
"grad_norm": 0.02651878260076046,
|
|
"learning_rate": 6.263345195729537e-07,
|
|
"loss": 0.005,
|
|
"num_tokens": 133671872.0,
|
|
"reward": 1.4299688339233398,
|
|
"reward_std": 0.17098167538642883,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.7499749660491943,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7980256676673889,
|
|
"step": 273
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 543.0,
|
|
"completions/max_terminated_length": 543.0,
|
|
"completions/mean_length": 294.70703125,
|
|
"completions/mean_terminated_length": 294.70703125,
|
|
"completions/min_length": 121.0,
|
|
"completions/min_terminated_length": 121.0,
|
|
"epoch": 0.4384,
|
|
"grad_norm": 0.026192937046289444,
|
|
"learning_rate": 6.245551601423488e-07,
|
|
"loss": 0.0083,
|
|
"num_tokens": 134164181.0,
|
|
"reward": 1.3008532524108887,
|
|
"reward_std": 0.15340715646743774,
|
|
"rewards/accuracy_reward_long_step": 0.3984375,
|
|
"rewards/final_brier_reward_long_step": 0.8331218957901001,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7765412330627441,
|
|
"step": 274
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 677.0,
|
|
"completions/max_terminated_length": 677.0,
|
|
"completions/mean_length": 295.03515625,
|
|
"completions/mean_terminated_length": 295.03515625,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.44,
|
|
"grad_norm": 0.027292657643556595,
|
|
"learning_rate": 6.227758007117438e-07,
|
|
"loss": 0.0172,
|
|
"num_tokens": 134653262.0,
|
|
"reward": 1.4425451755523682,
|
|
"reward_std": 0.19112396240234375,
|
|
"rewards/accuracy_reward_long_step": 0.55859375,
|
|
"rewards/final_brier_reward_long_step": 0.7454347610473633,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7981832027435303,
|
|
"step": 275
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 475.0,
|
|
"completions/max_terminated_length": 475.0,
|
|
"completions/mean_length": 280.5625,
|
|
"completions/mean_terminated_length": 280.5625,
|
|
"completions/min_length": 113.0,
|
|
"completions/min_terminated_length": 113.0,
|
|
"epoch": 0.4416,
|
|
"grad_norm": 0.03567759320139885,
|
|
"learning_rate": 6.209964412811388e-07,
|
|
"loss": -0.001,
|
|
"num_tokens": 135144110.0,
|
|
"reward": 1.4016071557998657,
|
|
"reward_std": 0.180876225233078,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.7937929630279541,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6876357793807983,
|
|
"step": 276
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 615.0,
|
|
"completions/max_terminated_length": 615.0,
|
|
"completions/mean_length": 296.69921875,
|
|
"completions/mean_terminated_length": 296.69921875,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.4432,
|
|
"grad_norm": 0.027701787650585175,
|
|
"learning_rate": 6.192170818505338e-07,
|
|
"loss": 0.0152,
|
|
"num_tokens": 135657897.0,
|
|
"reward": 1.2873167991638184,
|
|
"reward_std": 0.15553465485572815,
|
|
"rewards/accuracy_reward_long_step": 0.40625,
|
|
"rewards/final_brier_reward_long_step": 0.7762769460678101,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7479904890060425,
|
|
"step": 277
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 567.0,
|
|
"completions/max_terminated_length": 567.0,
|
|
"completions/mean_length": 288.46484375,
|
|
"completions/mean_terminated_length": 288.46484375,
|
|
"completions/min_length": 115.0,
|
|
"completions/min_terminated_length": 115.0,
|
|
"epoch": 0.4448,
|
|
"grad_norm": 0.027518663555383682,
|
|
"learning_rate": 6.174377224199287e-07,
|
|
"loss": 0.0105,
|
|
"num_tokens": 136161664.0,
|
|
"reward": 1.4033050537109375,
|
|
"reward_std": 0.16255879402160645,
|
|
"rewards/accuracy_reward_long_step": 0.5078125,
|
|
"rewards/final_brier_reward_long_step": 0.807449996471405,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7745203971862793,
|
|
"step": 278
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 601.0,
|
|
"completions/max_terminated_length": 601.0,
|
|
"completions/mean_length": 289.26171875,
|
|
"completions/mean_terminated_length": 289.26171875,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.4464,
|
|
"grad_norm": 0.030349889770150185,
|
|
"learning_rate": 6.156583629893238e-07,
|
|
"loss": 0.0083,
|
|
"num_tokens": 136661251.0,
|
|
"reward": 1.3778247833251953,
|
|
"reward_std": 0.22678440809249878,
|
|
"rewards/accuracy_reward_long_step": 0.5078125,
|
|
"rewards/final_brier_reward_long_step": 0.7540500164031982,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7259989976882935,
|
|
"step": 279
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 492.0,
|
|
"completions/max_terminated_length": 492.0,
|
|
"completions/mean_length": 292.5859375,
|
|
"completions/mean_terminated_length": 292.5859375,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.448,
|
|
"grad_norm": 0.026998843997716904,
|
|
"learning_rate": 6.138790035587188e-07,
|
|
"loss": 0.0038,
|
|
"num_tokens": 137162705.0,
|
|
"reward": 1.552132248878479,
|
|
"reward_std": 0.09305281937122345,
|
|
"rewards/accuracy_reward_long_step": 0.640625,
|
|
"rewards/final_brier_reward_long_step": 0.8342460989952087,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.811782956123352,
|
|
"step": 280
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 539.0,
|
|
"completions/max_terminated_length": 539.0,
|
|
"completions/mean_length": 301.2265625,
|
|
"completions/mean_terminated_length": 301.2265625,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"epoch": 0.4496,
|
|
"grad_norm": 0.02617485634982586,
|
|
"learning_rate": 6.120996441281139e-07,
|
|
"loss": -0.0058,
|
|
"num_tokens": 137672107.0,
|
|
"reward": 1.3473117351531982,
|
|
"reward_std": 0.1873582899570465,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.6998116970062256,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.783184826374054,
|
|
"step": 281
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 560.0,
|
|
"completions/max_terminated_length": 560.0,
|
|
"completions/mean_length": 291.34375,
|
|
"completions/mean_terminated_length": 291.34375,
|
|
"completions/min_length": 121.0,
|
|
"completions/min_terminated_length": 121.0,
|
|
"epoch": 0.4512,
|
|
"grad_norm": 0.02765418216586113,
|
|
"learning_rate": 6.103202846975089e-07,
|
|
"loss": 0.0093,
|
|
"num_tokens": 138184259.0,
|
|
"reward": 1.489346981048584,
|
|
"reward_std": 0.16857793927192688,
|
|
"rewards/accuracy_reward_long_step": 0.59375,
|
|
"rewards/final_brier_reward_long_step": 0.8282409906387329,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7541468143463135,
|
|
"step": 282
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 581.0,
|
|
"completions/max_terminated_length": 581.0,
|
|
"completions/mean_length": 286.015625,
|
|
"completions/mean_terminated_length": 286.015625,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.4528,
|
|
"grad_norm": 0.029208846390247345,
|
|
"learning_rate": 6.085409252669039e-07,
|
|
"loss": -0.0042,
|
|
"num_tokens": 138692327.0,
|
|
"reward": 1.3559755086898804,
|
|
"reward_std": 0.20295041799545288,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.7214418053627014,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7024602293968201,
|
|
"step": 283
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 609.0,
|
|
"completions/max_terminated_length": 609.0,
|
|
"completions/mean_length": 301.9296875,
|
|
"completions/mean_terminated_length": 301.9296875,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.4544,
|
|
"grad_norm": 0.030748292803764343,
|
|
"learning_rate": 6.067615658362989e-07,
|
|
"loss": -0.0041,
|
|
"num_tokens": 139194445.0,
|
|
"reward": 1.1845823526382446,
|
|
"reward_std": 0.12908682227134705,
|
|
"rewards/accuracy_reward_long_step": 0.30859375,
|
|
"rewards/final_brier_reward_long_step": 0.7271432876586914,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7768109440803528,
|
|
"step": 284
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 529.0,
|
|
"completions/max_terminated_length": 529.0,
|
|
"completions/mean_length": 282.84765625,
|
|
"completions/mean_terminated_length": 282.84765625,
|
|
"completions/min_length": 114.0,
|
|
"completions/min_terminated_length": 114.0,
|
|
"epoch": 0.456,
|
|
"grad_norm": 0.0292031429708004,
|
|
"learning_rate": 6.04982206405694e-07,
|
|
"loss": 0.0071,
|
|
"num_tokens": 139705542.0,
|
|
"reward": 1.32478928565979,
|
|
"reward_std": 0.21371236443519592,
|
|
"rewards/accuracy_reward_long_step": 0.43359375,
|
|
"rewards/final_brier_reward_long_step": 0.7342382073402405,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8305443525314331,
|
|
"step": 285
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 494.0,
|
|
"completions/max_terminated_length": 494.0,
|
|
"completions/mean_length": 290.70703125,
|
|
"completions/mean_terminated_length": 290.70703125,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.4576,
|
|
"grad_norm": 0.027747957035899162,
|
|
"learning_rate": 6.032028469750889e-07,
|
|
"loss": -0.0002,
|
|
"num_tokens": 140203851.0,
|
|
"reward": 1.3524994850158691,
|
|
"reward_std": 0.12304867804050446,
|
|
"rewards/accuracy_reward_long_step": 0.44921875,
|
|
"rewards/final_brier_reward_long_step": 0.7922519445419312,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8208708167076111,
|
|
"step": 286
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 534.0,
|
|
"completions/max_terminated_length": 534.0,
|
|
"completions/mean_length": 293.1640625,
|
|
"completions/mean_terminated_length": 293.1640625,
|
|
"completions/min_length": 163.0,
|
|
"completions/min_terminated_length": 163.0,
|
|
"epoch": 0.4592,
|
|
"grad_norm": 0.027378322556614876,
|
|
"learning_rate": 6.014234875444839e-07,
|
|
"loss": 0.013,
|
|
"num_tokens": 140702597.0,
|
|
"reward": 1.173850178718567,
|
|
"reward_std": 0.15952864289283752,
|
|
"rewards/accuracy_reward_long_step": 0.31640625,
|
|
"rewards/final_brier_reward_long_step": 0.7225792407989502,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7071964144706726,
|
|
"step": 287
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 671.0,
|
|
"completions/max_terminated_length": 671.0,
|
|
"completions/mean_length": 285.3203125,
|
|
"completions/mean_terminated_length": 285.3203125,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.4608,
|
|
"grad_norm": 0.02744467370212078,
|
|
"learning_rate": 5.99644128113879e-07,
|
|
"loss": 0.0087,
|
|
"num_tokens": 141206695.0,
|
|
"reward": 1.2383294105529785,
|
|
"reward_std": 0.14181900024414062,
|
|
"rewards/accuracy_reward_long_step": 0.359375,
|
|
"rewards/final_brier_reward_long_step": 0.7510156631469727,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7648018598556519,
|
|
"step": 288
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 440.0,
|
|
"completions/max_terminated_length": 440.0,
|
|
"completions/mean_length": 272.34375,
|
|
"completions/mean_terminated_length": 272.34375,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.4624,
|
|
"grad_norm": 0.02806474268436432,
|
|
"learning_rate": 5.97864768683274e-07,
|
|
"loss": 0.0099,
|
|
"num_tokens": 141695007.0,
|
|
"reward": 1.2011268138885498,
|
|
"reward_std": 0.14036910235881805,
|
|
"rewards/accuracy_reward_long_step": 0.3359375,
|
|
"rewards/final_brier_reward_long_step": 0.7097039222717285,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7510532736778259,
|
|
"step": 289
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 470.0,
|
|
"completions/max_terminated_length": 470.0,
|
|
"completions/mean_length": 266.53125,
|
|
"completions/mean_terminated_length": 266.53125,
|
|
"completions/min_length": 148.0,
|
|
"completions/min_terminated_length": 148.0,
|
|
"epoch": 0.464,
|
|
"grad_norm": 0.028714032843708992,
|
|
"learning_rate": 5.96085409252669e-07,
|
|
"loss": 0.0078,
|
|
"num_tokens": 142182015.0,
|
|
"reward": 1.4051401615142822,
|
|
"reward_std": 0.19630657136440277,
|
|
"rewards/accuracy_reward_long_step": 0.515625,
|
|
"rewards/final_brier_reward_long_step": 0.7274429798126221,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8306175470352173,
|
|
"step": 290
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 440.0,
|
|
"completions/max_terminated_length": 440.0,
|
|
"completions/mean_length": 264.8203125,
|
|
"completions/mean_terminated_length": 264.8203125,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.4656,
|
|
"grad_norm": 0.02966773696243763,
|
|
"learning_rate": 5.94306049822064e-07,
|
|
"loss": -0.0106,
|
|
"num_tokens": 142663985.0,
|
|
"reward": 1.3878998756408691,
|
|
"reward_std": 0.13589531183242798,
|
|
"rewards/accuracy_reward_long_step": 0.515625,
|
|
"rewards/final_brier_reward_long_step": 0.7204523086547852,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7764595746994019,
|
|
"step": 291
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 501.0,
|
|
"completions/max_terminated_length": 501.0,
|
|
"completions/mean_length": 279.4765625,
|
|
"completions/mean_terminated_length": 279.4765625,
|
|
"completions/min_length": 173.0,
|
|
"completions/min_terminated_length": 173.0,
|
|
"epoch": 0.4672,
|
|
"grad_norm": 0.02832869067788124,
|
|
"learning_rate": 5.925266903914591e-07,
|
|
"loss": 0.005,
|
|
"num_tokens": 143170555.0,
|
|
"reward": 1.2539631128311157,
|
|
"reward_std": 0.18463820219039917,
|
|
"rewards/accuracy_reward_long_step": 0.3984375,
|
|
"rewards/final_brier_reward_long_step": 0.6404902338981628,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7816122174263,
|
|
"step": 292
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 558.0,
|
|
"completions/max_terminated_length": 558.0,
|
|
"completions/mean_length": 261.421875,
|
|
"completions/mean_terminated_length": 261.421875,
|
|
"completions/min_length": 142.0,
|
|
"completions/min_terminated_length": 142.0,
|
|
"epoch": 0.4688,
|
|
"grad_norm": 0.03127996623516083,
|
|
"learning_rate": 5.90747330960854e-07,
|
|
"loss": 0.0174,
|
|
"num_tokens": 143668759.0,
|
|
"reward": 1.3849046230316162,
|
|
"reward_std": 0.14204376935958862,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.8194859027862549,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.767007052898407,
|
|
"step": 293
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 525.0,
|
|
"completions/max_terminated_length": 525.0,
|
|
"completions/mean_length": 280.4375,
|
|
"completions/mean_terminated_length": 280.4375,
|
|
"completions/min_length": 137.0,
|
|
"completions/min_terminated_length": 137.0,
|
|
"epoch": 0.4704,
|
|
"grad_norm": 0.030284756794571877,
|
|
"learning_rate": 5.889679715302491e-07,
|
|
"loss": -0.0029,
|
|
"num_tokens": 144175575.0,
|
|
"reward": 1.3805763721466064,
|
|
"reward_std": 0.18626053631305695,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.7643499970436096,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8048302531242371,
|
|
"step": 294
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 481.0,
|
|
"completions/max_terminated_length": 481.0,
|
|
"completions/mean_length": 267.71484375,
|
|
"completions/mean_terminated_length": 267.71484375,
|
|
"completions/min_length": 107.0,
|
|
"completions/min_terminated_length": 107.0,
|
|
"epoch": 0.472,
|
|
"grad_norm": 0.02768511138856411,
|
|
"learning_rate": 5.871886120996441e-07,
|
|
"loss": -0.0029,
|
|
"num_tokens": 144659334.0,
|
|
"reward": 1.3032722473144531,
|
|
"reward_std": 0.13138622045516968,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7461843490600586,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8106545805931091,
|
|
"step": 295
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 436.0,
|
|
"completions/max_terminated_length": 436.0,
|
|
"completions/mean_length": 271.9453125,
|
|
"completions/mean_terminated_length": 273.01177978515625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 121.0,
|
|
"epoch": 0.4736,
|
|
"grad_norm": 0.03377687931060791,
|
|
"learning_rate": 5.854092526690391e-07,
|
|
"loss": 0.0064,
|
|
"num_tokens": 145164320.0,
|
|
"reward": 1.291682243347168,
|
|
"reward_std": 0.21950051188468933,
|
|
"rewards/accuracy_reward_long_step": 0.3984375,
|
|
"rewards/final_brier_reward_long_step": 0.7516234517097473,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8291677236557007,
|
|
"step": 296
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 491.0,
|
|
"completions/max_terminated_length": 491.0,
|
|
"completions/mean_length": 261.671875,
|
|
"completions/mean_terminated_length": 261.671875,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.4752,
|
|
"grad_norm": 0.0278344564139843,
|
|
"learning_rate": 5.836298932384342e-07,
|
|
"loss": -0.0083,
|
|
"num_tokens": 145661492.0,
|
|
"reward": 1.4372165203094482,
|
|
"reward_std": 0.19561487436294556,
|
|
"rewards/accuracy_reward_long_step": 0.52734375,
|
|
"rewards/final_brier_reward_long_step": 0.7994953393936157,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8478083610534668,
|
|
"step": 297
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 459.0,
|
|
"completions/max_terminated_length": 459.0,
|
|
"completions/mean_length": 260.375,
|
|
"completions/mean_terminated_length": 260.375,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.4768,
|
|
"grad_norm": 0.030550826340913773,
|
|
"learning_rate": 5.818505338078291e-07,
|
|
"loss": 0.0046,
|
|
"num_tokens": 146145276.0,
|
|
"reward": 1.3957126140594482,
|
|
"reward_std": 0.1581364870071411,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.7615882754325867,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6962625980377197,
|
|
"step": 298
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 439.0,
|
|
"completions/max_terminated_length": 439.0,
|
|
"completions/mean_length": 262.453125,
|
|
"completions/mean_terminated_length": 262.453125,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.4784,
|
|
"grad_norm": 0.02928241901099682,
|
|
"learning_rate": 5.800711743772242e-07,
|
|
"loss": -0.0049,
|
|
"num_tokens": 146631200.0,
|
|
"reward": 1.560309648513794,
|
|
"reward_std": 0.15518754720687866,
|
|
"rewards/accuracy_reward_long_step": 0.68359375,
|
|
"rewards/final_brier_reward_long_step": 0.7499921321868896,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.756871223449707,
|
|
"step": 299
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 585.0,
|
|
"completions/max_terminated_length": 585.0,
|
|
"completions/mean_length": 270.3203125,
|
|
"completions/mean_terminated_length": 270.3203125,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.48,
|
|
"grad_norm": 0.032593853771686554,
|
|
"learning_rate": 5.782918149466191e-07,
|
|
"loss": 0.0044,
|
|
"num_tokens": 147121786.0,
|
|
"reward": 1.4181792736053467,
|
|
"reward_std": 0.17730304598808289,
|
|
"rewards/accuracy_reward_long_step": 0.51953125,
|
|
"rewards/final_brier_reward_long_step": 0.7884241342544556,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8061679601669312,
|
|
"step": 300
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 471.0,
|
|
"completions/max_terminated_length": 471.0,
|
|
"completions/mean_length": 275.15625,
|
|
"completions/mean_terminated_length": 275.15625,
|
|
"completions/min_length": 143.0,
|
|
"completions/min_terminated_length": 143.0,
|
|
"epoch": 0.4816,
|
|
"grad_norm": 0.028711630031466484,
|
|
"learning_rate": 5.765124555160142e-07,
|
|
"loss": 0.0037,
|
|
"num_tokens": 147618706.0,
|
|
"reward": 1.3091957569122314,
|
|
"reward_std": 0.17498339712619781,
|
|
"rewards/accuracy_reward_long_step": 0.42578125,
|
|
"rewards/final_brier_reward_long_step": 0.6745136976242065,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.859144389629364,
|
|
"step": 301
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 469.0,
|
|
"completions/max_terminated_length": 469.0,
|
|
"completions/mean_length": 272.421875,
|
|
"completions/mean_terminated_length": 272.421875,
|
|
"completions/min_length": 130.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 0.4832,
|
|
"grad_norm": 0.02766992151737213,
|
|
"learning_rate": 5.747330960854092e-07,
|
|
"loss": -0.0112,
|
|
"num_tokens": 148128422.0,
|
|
"reward": 1.4776198863983154,
|
|
"reward_std": 0.13255634903907776,
|
|
"rewards/accuracy_reward_long_step": 0.58984375,
|
|
"rewards/final_brier_reward_long_step": 0.7580232620239258,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7930811643600464,
|
|
"step": 302
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 464.0,
|
|
"completions/max_terminated_length": 464.0,
|
|
"completions/mean_length": 270.1875,
|
|
"completions/mean_terminated_length": 270.1875,
|
|
"completions/min_length": 122.0,
|
|
"completions/min_terminated_length": 122.0,
|
|
"epoch": 0.4848,
|
|
"grad_norm": 0.03053920716047287,
|
|
"learning_rate": 5.729537366548043e-07,
|
|
"loss": 0.0108,
|
|
"num_tokens": 148619142.0,
|
|
"reward": 1.3393454551696777,
|
|
"reward_std": 0.1206967830657959,
|
|
"rewards/accuracy_reward_long_step": 0.44921875,
|
|
"rewards/final_brier_reward_long_step": 0.7705498933792114,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7899569272994995,
|
|
"step": 303
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 578.0,
|
|
"completions/max_terminated_length": 578.0,
|
|
"completions/mean_length": 275.53125,
|
|
"completions/mean_terminated_length": 275.53125,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.4864,
|
|
"grad_norm": 0.0303883645683527,
|
|
"learning_rate": 5.711743772241993e-07,
|
|
"loss": 0.0186,
|
|
"num_tokens": 149120358.0,
|
|
"reward": 1.4402146339416504,
|
|
"reward_std": 0.17118000984191895,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.8297659158706665,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8373425006866455,
|
|
"step": 304
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 565.0,
|
|
"completions/max_terminated_length": 565.0,
|
|
"completions/mean_length": 282.84765625,
|
|
"completions/mean_terminated_length": 282.84765625,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.488,
|
|
"grad_norm": 0.02846652828156948,
|
|
"learning_rate": 5.693950177935943e-07,
|
|
"loss": 0.0182,
|
|
"num_tokens": 149610279.0,
|
|
"reward": 1.5291298627853394,
|
|
"reward_std": 0.18615154922008514,
|
|
"rewards/accuracy_reward_long_step": 0.62890625,
|
|
"rewards/final_brier_reward_long_step": 0.8304492235183716,
|
|
"rewards/format_reward_long_step": 0.9921875,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7860701084136963,
|
|
"step": 305
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 509.0,
|
|
"completions/max_terminated_length": 509.0,
|
|
"completions/mean_length": 264.23046875,
|
|
"completions/mean_terminated_length": 264.23046875,
|
|
"completions/min_length": 134.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.4896,
|
|
"grad_norm": 0.03294990211725235,
|
|
"learning_rate": 5.676156583629893e-07,
|
|
"loss": 0.0049,
|
|
"num_tokens": 150121194.0,
|
|
"reward": 1.437424659729004,
|
|
"reward_std": 0.1931784749031067,
|
|
"rewards/accuracy_reward_long_step": 0.55078125,
|
|
"rewards/final_brier_reward_long_step": 0.7560929656028748,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7904808521270752,
|
|
"step": 306
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 466.0,
|
|
"completions/max_terminated_length": 466.0,
|
|
"completions/mean_length": 270.6640625,
|
|
"completions/mean_terminated_length": 270.6640625,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.4912,
|
|
"grad_norm": 0.028145214542746544,
|
|
"learning_rate": 5.658362989323842e-07,
|
|
"loss": 0.0125,
|
|
"num_tokens": 150638356.0,
|
|
"reward": 1.3298184871673584,
|
|
"reward_std": 0.15343712270259857,
|
|
"rewards/accuracy_reward_long_step": 0.45703125,
|
|
"rewards/final_brier_reward_long_step": 0.7142324447631836,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.77691650390625,
|
|
"step": 307
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 506.0,
|
|
"completions/max_terminated_length": 506.0,
|
|
"completions/mean_length": 265.69140625,
|
|
"completions/mean_terminated_length": 265.69140625,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.4928,
|
|
"grad_norm": 0.03243206813931465,
|
|
"learning_rate": 5.640569395017794e-07,
|
|
"loss": -0.0123,
|
|
"num_tokens": 151144573.0,
|
|
"reward": 1.4519245624542236,
|
|
"reward_std": 0.1609051525592804,
|
|
"rewards/accuracy_reward_long_step": 0.5859375,
|
|
"rewards/final_brier_reward_long_step": 0.7463042736053467,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7176437377929688,
|
|
"step": 308
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 525.0,
|
|
"completions/max_terminated_length": 525.0,
|
|
"completions/mean_length": 253.2734375,
|
|
"completions/mean_terminated_length": 253.2734375,
|
|
"completions/min_length": 143.0,
|
|
"completions/min_terminated_length": 143.0,
|
|
"epoch": 0.4944,
|
|
"grad_norm": 0.028735455125570297,
|
|
"learning_rate": 5.622775800711744e-07,
|
|
"loss": 0.0077,
|
|
"num_tokens": 151640987.0,
|
|
"reward": 1.323132038116455,
|
|
"reward_std": 0.12191449105739594,
|
|
"rewards/accuracy_reward_long_step": 0.4453125,
|
|
"rewards/final_brier_reward_long_step": 0.7362834215164185,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7749943733215332,
|
|
"step": 309
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 517.0,
|
|
"completions/max_terminated_length": 517.0,
|
|
"completions/mean_length": 267.671875,
|
|
"completions/mean_terminated_length": 267.671875,
|
|
"completions/min_length": 171.0,
|
|
"completions/min_terminated_length": 171.0,
|
|
"epoch": 0.496,
|
|
"grad_norm": 0.03202186897397041,
|
|
"learning_rate": 5.604982206405694e-07,
|
|
"loss": 0.0031,
|
|
"num_tokens": 152130871.0,
|
|
"reward": 1.4370348453521729,
|
|
"reward_std": 0.1757480651140213,
|
|
"rewards/accuracy_reward_long_step": 0.57421875,
|
|
"rewards/final_brier_reward_long_step": 0.7055065631866455,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.745758056640625,
|
|
"step": 310
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 476.0,
|
|
"completions/max_terminated_length": 476.0,
|
|
"completions/mean_length": 268.640625,
|
|
"completions/mean_terminated_length": 268.640625,
|
|
"completions/min_length": 158.0,
|
|
"completions/min_terminated_length": 158.0,
|
|
"epoch": 0.4976,
|
|
"grad_norm": 0.029871582984924316,
|
|
"learning_rate": 5.587188612099644e-07,
|
|
"loss": 0.0021,
|
|
"num_tokens": 152631003.0,
|
|
"reward": 1.5090341567993164,
|
|
"reward_std": 0.16095715761184692,
|
|
"rewards/accuracy_reward_long_step": 0.625,
|
|
"rewards/final_brier_reward_long_step": 0.764398455619812,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7717381715774536,
|
|
"step": 311
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 524.0,
|
|
"completions/max_terminated_length": 524.0,
|
|
"completions/mean_length": 259.47265625,
|
|
"completions/mean_terminated_length": 259.47265625,
|
|
"completions/min_length": 142.0,
|
|
"completions/min_terminated_length": 142.0,
|
|
"epoch": 0.4992,
|
|
"grad_norm": 0.03202946111559868,
|
|
"learning_rate": 5.569395017793594e-07,
|
|
"loss": -0.006,
|
|
"num_tokens": 153126428.0,
|
|
"reward": 1.4881701469421387,
|
|
"reward_std": 0.14949887990951538,
|
|
"rewards/accuracy_reward_long_step": 0.5859375,
|
|
"rewards/final_brier_reward_long_step": 0.7662238478660583,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.842707097530365,
|
|
"step": 312
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 515.0,
|
|
"completions/max_terminated_length": 515.0,
|
|
"completions/mean_length": 249.61328125,
|
|
"completions/mean_terminated_length": 249.61328125,
|
|
"completions/min_length": 94.0,
|
|
"completions/min_terminated_length": 94.0,
|
|
"epoch": 0.5008,
|
|
"grad_norm": 0.03125175088644028,
|
|
"learning_rate": 5.551601423487544e-07,
|
|
"loss": 0.0167,
|
|
"num_tokens": 153607737.0,
|
|
"reward": 1.461435079574585,
|
|
"reward_std": 0.16793528199195862,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.8274839520454407,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7682562470436096,
|
|
"step": 313
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 547.0,
|
|
"completions/max_terminated_length": 547.0,
|
|
"completions/mean_length": 260.56640625,
|
|
"completions/mean_terminated_length": 260.56640625,
|
|
"completions/min_length": 122.0,
|
|
"completions/min_terminated_length": 122.0,
|
|
"epoch": 0.5024,
|
|
"grad_norm": 0.03080155700445175,
|
|
"learning_rate": 5.533807829181495e-07,
|
|
"loss": -0.0045,
|
|
"num_tokens": 154100658.0,
|
|
"reward": 1.3649942874908447,
|
|
"reward_std": 0.20213352143764496,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.7633898258209229,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7434619665145874,
|
|
"step": 314
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 465.0,
|
|
"completions/max_terminated_length": 465.0,
|
|
"completions/mean_length": 260.87109375,
|
|
"completions/mean_terminated_length": 260.87109375,
|
|
"completions/min_length": 92.0,
|
|
"completions/min_terminated_length": 92.0,
|
|
"epoch": 0.504,
|
|
"grad_norm": 0.030286213383078575,
|
|
"learning_rate": 5.516014234875445e-07,
|
|
"loss": 0.0064,
|
|
"num_tokens": 154586593.0,
|
|
"reward": 1.4095053672790527,
|
|
"reward_std": 0.19581949710845947,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.7196574211120605,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7621144652366638,
|
|
"step": 315
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 575.0,
|
|
"completions/max_terminated_length": 575.0,
|
|
"completions/mean_length": 282.4765625,
|
|
"completions/mean_terminated_length": 282.4765625,
|
|
"completions/min_length": 146.0,
|
|
"completions/min_terminated_length": 146.0,
|
|
"epoch": 0.5056,
|
|
"grad_norm": 0.03086298704147339,
|
|
"learning_rate": 5.498220640569395e-07,
|
|
"loss": -0.0094,
|
|
"num_tokens": 155107683.0,
|
|
"reward": 1.233945608139038,
|
|
"reward_std": 0.2137664556503296,
|
|
"rewards/accuracy_reward_long_step": 0.40234375,
|
|
"rewards/final_brier_reward_long_step": 0.5642339587211609,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7621732950210571,
|
|
"step": 316
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 646.0,
|
|
"completions/max_terminated_length": 646.0,
|
|
"completions/mean_length": 273.9296875,
|
|
"completions/mean_terminated_length": 273.9296875,
|
|
"completions/min_length": 155.0,
|
|
"completions/min_terminated_length": 155.0,
|
|
"epoch": 0.5072,
|
|
"grad_norm": 0.029751170426607132,
|
|
"learning_rate": 5.480427046263345e-07,
|
|
"loss": 0.0058,
|
|
"num_tokens": 155611881.0,
|
|
"reward": 1.3837661743164062,
|
|
"reward_std": 0.1578415334224701,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.7951062917709351,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8024587631225586,
|
|
"step": 317
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 655.0,
|
|
"completions/max_terminated_length": 655.0,
|
|
"completions/mean_length": 270.0625,
|
|
"completions/mean_terminated_length": 270.0625,
|
|
"completions/min_length": 138.0,
|
|
"completions/min_terminated_length": 138.0,
|
|
"epoch": 0.5088,
|
|
"grad_norm": 0.030748968943953514,
|
|
"learning_rate": 5.462633451957295e-07,
|
|
"loss": 0.0082,
|
|
"num_tokens": 156108097.0,
|
|
"reward": 1.4630606174468994,
|
|
"reward_std": 0.1447874754667282,
|
|
"rewards/accuracy_reward_long_step": 0.5546875,
|
|
"rewards/final_brier_reward_long_step": 0.8417631983757019,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7917290329933167,
|
|
"step": 318
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 568.0,
|
|
"completions/max_terminated_length": 568.0,
|
|
"completions/mean_length": 261.6328125,
|
|
"completions/mean_terminated_length": 261.6328125,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"epoch": 0.5104,
|
|
"grad_norm": 0.03255579620599747,
|
|
"learning_rate": 5.444839857651245e-07,
|
|
"loss": 0.0189,
|
|
"num_tokens": 156600459.0,
|
|
"reward": 1.453078031539917,
|
|
"reward_std": 0.21487998962402344,
|
|
"rewards/accuracy_reward_long_step": 0.55859375,
|
|
"rewards/final_brier_reward_long_step": 0.7861437797546387,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7917930483818054,
|
|
"step": 319
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 482.0,
|
|
"completions/max_terminated_length": 482.0,
|
|
"completions/mean_length": 255.890625,
|
|
"completions/mean_terminated_length": 255.890625,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.512,
|
|
"grad_norm": 0.03229495882987976,
|
|
"learning_rate": 5.427046263345195e-07,
|
|
"loss": 0.0071,
|
|
"num_tokens": 157085455.0,
|
|
"reward": 1.3672808408737183,
|
|
"reward_std": 0.13604342937469482,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.7324371337890625,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8148113489151001,
|
|
"step": 320
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 508.0,
|
|
"completions/max_terminated_length": 508.0,
|
|
"completions/mean_length": 260.43359375,
|
|
"completions/mean_terminated_length": 260.43359375,
|
|
"completions/min_length": 114.0,
|
|
"completions/min_terminated_length": 114.0,
|
|
"epoch": 0.5136,
|
|
"grad_norm": 0.03101903200149536,
|
|
"learning_rate": 5.409252669039146e-07,
|
|
"loss": 0.0156,
|
|
"num_tokens": 157573070.0,
|
|
"reward": 1.4484410285949707,
|
|
"reward_std": 0.19034847617149353,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.7853777408599854,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.758386492729187,
|
|
"step": 321
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 582.0,
|
|
"completions/max_terminated_length": 582.0,
|
|
"completions/mean_length": 259.2109375,
|
|
"completions/mean_terminated_length": 259.2109375,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.5152,
|
|
"grad_norm": 0.032188545912504196,
|
|
"learning_rate": 5.391459074733096e-07,
|
|
"loss": 0.0058,
|
|
"num_tokens": 158069828.0,
|
|
"reward": 1.2921218872070312,
|
|
"reward_std": 0.1610349416732788,
|
|
"rewards/accuracy_reward_long_step": 0.4453125,
|
|
"rewards/final_brier_reward_long_step": 0.6915820837020874,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.703468382358551,
|
|
"step": 322
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 503.0,
|
|
"completions/max_terminated_length": 503.0,
|
|
"completions/mean_length": 253.05859375,
|
|
"completions/mean_terminated_length": 253.05859375,
|
|
"completions/min_length": 114.0,
|
|
"completions/min_terminated_length": 114.0,
|
|
"epoch": 0.5168,
|
|
"grad_norm": 0.030405355617403984,
|
|
"learning_rate": 5.373665480427047e-07,
|
|
"loss": 0.0005,
|
|
"num_tokens": 158552475.0,
|
|
"reward": 1.4460304975509644,
|
|
"reward_std": 0.18330608308315277,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.8409663438796997,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7869055271148682,
|
|
"step": 323
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 450.0,
|
|
"completions/max_terminated_length": 450.0,
|
|
"completions/mean_length": 249.57421875,
|
|
"completions/mean_terminated_length": 249.57421875,
|
|
"completions/min_length": 112.0,
|
|
"completions/min_terminated_length": 112.0,
|
|
"epoch": 0.5184,
|
|
"grad_norm": 0.033257272094488144,
|
|
"learning_rate": 5.355871886120996e-07,
|
|
"loss": -0.0137,
|
|
"num_tokens": 159043982.0,
|
|
"reward": 1.4014875888824463,
|
|
"reward_std": 0.10161018371582031,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.8111592531204224,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8104158043861389,
|
|
"step": 324
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 517.0,
|
|
"completions/max_terminated_length": 517.0,
|
|
"completions/mean_length": 254.00390625,
|
|
"completions/mean_terminated_length": 254.00390625,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"epoch": 0.52,
|
|
"grad_norm": 0.03563224524259567,
|
|
"learning_rate": 5.338078291814946e-07,
|
|
"loss": -0.0095,
|
|
"num_tokens": 159536455.0,
|
|
"reward": 1.3375245332717896,
|
|
"reward_std": 0.16401183605194092,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.6796140670776367,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7954840660095215,
|
|
"step": 325
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 461.0,
|
|
"completions/max_terminated_length": 461.0,
|
|
"completions/mean_length": 242.38671875,
|
|
"completions/mean_terminated_length": 242.38671875,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.5216,
|
|
"grad_norm": 0.034420643001794815,
|
|
"learning_rate": 5.320284697508896e-07,
|
|
"loss": 0.0041,
|
|
"num_tokens": 160011394.0,
|
|
"reward": 1.3240625858306885,
|
|
"reward_std": 0.15140679478645325,
|
|
"rewards/accuracy_reward_long_step": 0.4609375,
|
|
"rewards/final_brier_reward_long_step": 0.7299777269363403,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7225224375724792,
|
|
"step": 326
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 497.0,
|
|
"completions/max_terminated_length": 497.0,
|
|
"completions/mean_length": 248.5546875,
|
|
"completions/mean_terminated_length": 248.5546875,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.5232,
|
|
"grad_norm": 0.03537153825163841,
|
|
"learning_rate": 5.302491103202846e-07,
|
|
"loss": -0.012,
|
|
"num_tokens": 160502680.0,
|
|
"reward": 1.5624842643737793,
|
|
"reward_std": 0.18345743417739868,
|
|
"rewards/accuracy_reward_long_step": 0.69921875,
|
|
"rewards/final_brier_reward_long_step": 0.7545043230056763,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6985577344894409,
|
|
"step": 327
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 608.0,
|
|
"completions/max_terminated_length": 608.0,
|
|
"completions/mean_length": 255.44921875,
|
|
"completions/mean_terminated_length": 255.44921875,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.5248,
|
|
"grad_norm": 0.038355033844709396,
|
|
"learning_rate": 5.284697508896797e-07,
|
|
"loss": 0.0104,
|
|
"num_tokens": 160992083.0,
|
|
"reward": 1.406353235244751,
|
|
"reward_std": 0.15258005261421204,
|
|
"rewards/accuracy_reward_long_step": 0.51953125,
|
|
"rewards/final_brier_reward_long_step": 0.7742776274681091,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7730100750923157,
|
|
"step": 328
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 478.0,
|
|
"completions/max_terminated_length": 478.0,
|
|
"completions/mean_length": 249.23046875,
|
|
"completions/mean_terminated_length": 249.23046875,
|
|
"completions/min_length": 96.0,
|
|
"completions/min_terminated_length": 96.0,
|
|
"epoch": 0.5264,
|
|
"grad_norm": 0.03090524673461914,
|
|
"learning_rate": 5.266903914590747e-07,
|
|
"loss": 0.0061,
|
|
"num_tokens": 161497718.0,
|
|
"reward": 1.4268403053283691,
|
|
"reward_std": 0.11180461198091507,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.8800667524337769,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8429189324378967,
|
|
"step": 329
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 442.0,
|
|
"completions/max_terminated_length": 442.0,
|
|
"completions/mean_length": 250.3515625,
|
|
"completions/mean_terminated_length": 250.3515625,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.528,
|
|
"grad_norm": 0.034447081387043,
|
|
"learning_rate": 5.249110320284698e-07,
|
|
"loss": 0.0033,
|
|
"num_tokens": 161980056.0,
|
|
"reward": 1.3684167861938477,
|
|
"reward_std": 0.18932107090950012,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.7735214829444885,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7626460790634155,
|
|
"step": 330
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 541.0,
|
|
"completions/max_terminated_length": 541.0,
|
|
"completions/mean_length": 249.62109375,
|
|
"completions/mean_terminated_length": 249.62109375,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.5296,
|
|
"grad_norm": 0.034051742404699326,
|
|
"learning_rate": 5.231316725978647e-07,
|
|
"loss": -0.0067,
|
|
"num_tokens": 162477343.0,
|
|
"reward": 1.403045654296875,
|
|
"reward_std": 0.18791253864765167,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.7827702760696411,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7434748411178589,
|
|
"step": 331
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 618.0,
|
|
"completions/max_terminated_length": 618.0,
|
|
"completions/mean_length": 243.046875,
|
|
"completions/mean_terminated_length": 243.046875,
|
|
"completions/min_length": 98.0,
|
|
"completions/min_terminated_length": 98.0,
|
|
"epoch": 0.5312,
|
|
"grad_norm": 0.03361974656581879,
|
|
"learning_rate": 5.213523131672598e-07,
|
|
"loss": 0.0114,
|
|
"num_tokens": 162956923.0,
|
|
"reward": 1.544013500213623,
|
|
"reward_std": 0.20560047030448914,
|
|
"rewards/accuracy_reward_long_step": 0.66796875,
|
|
"rewards/final_brier_reward_long_step": 0.7412210702896118,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7629580497741699,
|
|
"step": 332
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 578.0,
|
|
"completions/max_terminated_length": 578.0,
|
|
"completions/mean_length": 256.22265625,
|
|
"completions/mean_terminated_length": 256.22265625,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.5328,
|
|
"grad_norm": 0.03932815417647362,
|
|
"learning_rate": 5.195729537366548e-07,
|
|
"loss": -0.0093,
|
|
"num_tokens": 163447412.0,
|
|
"reward": 1.455832839012146,
|
|
"reward_std": 0.1748843789100647,
|
|
"rewards/accuracy_reward_long_step": 0.546875,
|
|
"rewards/final_brier_reward_long_step": 0.825259804725647,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.810571551322937,
|
|
"step": 333
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 538.0,
|
|
"completions/max_terminated_length": 538.0,
|
|
"completions/mean_length": 245.2265625,
|
|
"completions/mean_terminated_length": 245.2265625,
|
|
"completions/min_length": 97.0,
|
|
"completions/min_terminated_length": 97.0,
|
|
"epoch": 0.5344,
|
|
"grad_norm": 0.03335999324917793,
|
|
"learning_rate": 5.177935943060498e-07,
|
|
"loss": -0.0032,
|
|
"num_tokens": 163941862.0,
|
|
"reward": 1.3817181587219238,
|
|
"reward_std": 0.17352280020713806,
|
|
"rewards/accuracy_reward_long_step": 0.515625,
|
|
"rewards/final_brier_reward_long_step": 0.7708051204681396,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6935676336288452,
|
|
"step": 334
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 484.0,
|
|
"completions/max_terminated_length": 484.0,
|
|
"completions/mean_length": 236.14453125,
|
|
"completions/mean_terminated_length": 236.14453125,
|
|
"completions/min_length": 96.0,
|
|
"completions/min_terminated_length": 96.0,
|
|
"epoch": 0.536,
|
|
"grad_norm": 0.03453889861702919,
|
|
"learning_rate": 5.160142348754448e-07,
|
|
"loss": 0.0059,
|
|
"num_tokens": 164414147.0,
|
|
"reward": 1.4293192625045776,
|
|
"reward_std": 0.1838047057390213,
|
|
"rewards/accuracy_reward_long_step": 0.52734375,
|
|
"rewards/final_brier_reward_long_step": 0.8181675672531128,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7897346019744873,
|
|
"step": 335
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 448.0,
|
|
"completions/max_terminated_length": 448.0,
|
|
"completions/mean_length": 247.52734375,
|
|
"completions/mean_terminated_length": 247.52734375,
|
|
"completions/min_length": 115.0,
|
|
"completions/min_terminated_length": 115.0,
|
|
"epoch": 0.5376,
|
|
"grad_norm": 0.03347809612751007,
|
|
"learning_rate": 5.142348754448398e-07,
|
|
"loss": -0.0082,
|
|
"num_tokens": 164904322.0,
|
|
"reward": 1.4061800241470337,
|
|
"reward_std": 0.15439936518669128,
|
|
"rewards/accuracy_reward_long_step": 0.5546875,
|
|
"rewards/final_brier_reward_long_step": 0.740646481513977,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6653236746788025,
|
|
"step": 336
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 547.0,
|
|
"completions/max_terminated_length": 547.0,
|
|
"completions/mean_length": 246.3828125,
|
|
"completions/mean_terminated_length": 246.3828125,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.5392,
|
|
"grad_norm": 0.03456057235598564,
|
|
"learning_rate": 5.124555160142349e-07,
|
|
"loss": 0.002,
|
|
"num_tokens": 165401044.0,
|
|
"reward": 1.3541990518569946,
|
|
"reward_std": 0.1308155059814453,
|
|
"rewards/accuracy_reward_long_step": 0.515625,
|
|
"rewards/final_brier_reward_long_step": 0.6761799454689026,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6781162023544312,
|
|
"step": 337
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 693.0,
|
|
"completions/max_terminated_length": 693.0,
|
|
"completions/mean_length": 259.3046875,
|
|
"completions/mean_terminated_length": 259.3046875,
|
|
"completions/min_length": 134.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.5408,
|
|
"grad_norm": 0.029813647270202637,
|
|
"learning_rate": 5.106761565836298e-07,
|
|
"loss": 0.0009,
|
|
"num_tokens": 165906058.0,
|
|
"reward": 1.3256361484527588,
|
|
"reward_std": 0.19530092179775238,
|
|
"rewards/accuracy_reward_long_step": 0.46484375,
|
|
"rewards/final_brier_reward_long_step": 0.670098066329956,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7730719447135925,
|
|
"step": 338
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 439.0,
|
|
"completions/max_terminated_length": 439.0,
|
|
"completions/mean_length": 246.8046875,
|
|
"completions/mean_terminated_length": 246.8046875,
|
|
"completions/min_length": 106.0,
|
|
"completions/min_terminated_length": 106.0,
|
|
"epoch": 0.5424,
|
|
"grad_norm": 0.03796224668622017,
|
|
"learning_rate": 5.088967971530249e-07,
|
|
"loss": 0.0075,
|
|
"num_tokens": 166401920.0,
|
|
"reward": 1.4465839862823486,
|
|
"reward_std": 0.207666277885437,
|
|
"rewards/accuracy_reward_long_step": 0.625,
|
|
"rewards/final_brier_reward_long_step": 0.7033705711364746,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.5829657316207886,
|
|
"step": 339
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 522.0,
|
|
"completions/max_terminated_length": 522.0,
|
|
"completions/mean_length": 260.37109375,
|
|
"completions/mean_terminated_length": 260.37109375,
|
|
"completions/min_length": 155.0,
|
|
"completions/min_terminated_length": 155.0,
|
|
"epoch": 0.544,
|
|
"grad_norm": 0.03487522527575493,
|
|
"learning_rate": 5.071174377224199e-07,
|
|
"loss": -0.0028,
|
|
"num_tokens": 166900079.0,
|
|
"reward": 1.364084243774414,
|
|
"reward_std": 0.12331333756446838,
|
|
"rewards/accuracy_reward_long_step": 0.50390625,
|
|
"rewards/final_brier_reward_long_step": 0.7763662934303284,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6643457412719727,
|
|
"step": 340
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 882.0,
|
|
"completions/max_terminated_length": 882.0,
|
|
"completions/mean_length": 253.23828125,
|
|
"completions/mean_terminated_length": 253.23828125,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.5456,
|
|
"grad_norm": 0.03571302443742752,
|
|
"learning_rate": 5.053380782918149e-07,
|
|
"loss": 0.0008,
|
|
"num_tokens": 167388084.0,
|
|
"reward": 1.4308526515960693,
|
|
"reward_std": 0.14994728565216064,
|
|
"rewards/accuracy_reward_long_step": 0.55078125,
|
|
"rewards/final_brier_reward_long_step": 0.7553993463516235,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7648867964744568,
|
|
"step": 341
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 547.0,
|
|
"completions/max_terminated_length": 547.0,
|
|
"completions/mean_length": 243.5703125,
|
|
"completions/mean_terminated_length": 243.5703125,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.5472,
|
|
"grad_norm": 0.034076888114213943,
|
|
"learning_rate": 5.0355871886121e-07,
|
|
"loss": -0.0049,
|
|
"num_tokens": 167867318.0,
|
|
"reward": 1.4327844381332397,
|
|
"reward_std": 0.17568854987621307,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.7377663850784302,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7433711290359497,
|
|
"step": 342
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 492.0,
|
|
"completions/max_terminated_length": 492.0,
|
|
"completions/mean_length": 231.46875,
|
|
"completions/mean_terminated_length": 231.46875,
|
|
"completions/min_length": 130.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 0.5488,
|
|
"grad_norm": 0.03265485167503357,
|
|
"learning_rate": 5.01779359430605e-07,
|
|
"loss": 0.0058,
|
|
"num_tokens": 168334662.0,
|
|
"reward": 1.5790760517120361,
|
|
"reward_std": 0.20140470564365387,
|
|
"rewards/accuracy_reward_long_step": 0.69921875,
|
|
"rewards/final_brier_reward_long_step": 0.7507095336914062,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7765324115753174,
|
|
"step": 343
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 459.0,
|
|
"completions/max_terminated_length": 459.0,
|
|
"completions/mean_length": 242.12890625,
|
|
"completions/mean_terminated_length": 242.12890625,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.5504,
|
|
"grad_norm": 0.0313512459397316,
|
|
"learning_rate": 5e-07,
|
|
"loss": -0.0061,
|
|
"num_tokens": 168823871.0,
|
|
"reward": 1.3449064493179321,
|
|
"reward_std": 0.16971346735954285,
|
|
"rewards/accuracy_reward_long_step": 0.47265625,
|
|
"rewards/final_brier_reward_long_step": 0.7380057573318481,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7509950995445251,
|
|
"step": 344
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 432.0,
|
|
"completions/max_terminated_length": 432.0,
|
|
"completions/mean_length": 240.40625,
|
|
"completions/mean_terminated_length": 240.40625,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.552,
|
|
"grad_norm": 0.035064004361629486,
|
|
"learning_rate": 4.98220640569395e-07,
|
|
"loss": 0.0009,
|
|
"num_tokens": 169308207.0,
|
|
"reward": 1.447171688079834,
|
|
"reward_std": 0.1690702587366104,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.766781210899353,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7406556606292725,
|
|
"step": 345
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 542.0,
|
|
"completions/max_terminated_length": 542.0,
|
|
"completions/mean_length": 255.90234375,
|
|
"completions/mean_terminated_length": 255.90234375,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.5536,
|
|
"grad_norm": 0.03382926061749458,
|
|
"learning_rate": 4.9644128113879e-07,
|
|
"loss": 0.0094,
|
|
"num_tokens": 169790286.0,
|
|
"reward": 1.3020596504211426,
|
|
"reward_std": 0.1404658854007721,
|
|
"rewards/accuracy_reward_long_step": 0.4375,
|
|
"rewards/final_brier_reward_long_step": 0.7163107991218567,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.741927981376648,
|
|
"step": 346
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 472.0,
|
|
"completions/max_terminated_length": 472.0,
|
|
"completions/mean_length": 238.2265625,
|
|
"completions/mean_terminated_length": 238.2265625,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.5552,
|
|
"grad_norm": 0.04365375638008118,
|
|
"learning_rate": 4.94661921708185e-07,
|
|
"loss": 0.0012,
|
|
"num_tokens": 170253896.0,
|
|
"reward": 1.3717888593673706,
|
|
"reward_std": 0.159200519323349,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.7934491634368896,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8187063932418823,
|
|
"step": 347
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 448.0,
|
|
"completions/max_terminated_length": 448.0,
|
|
"completions/mean_length": 253.41796875,
|
|
"completions/mean_terminated_length": 253.41796875,
|
|
"completions/min_length": 137.0,
|
|
"completions/min_terminated_length": 137.0,
|
|
"epoch": 0.5568,
|
|
"grad_norm": 0.034737542271614075,
|
|
"learning_rate": 4.9288256227758e-07,
|
|
"loss": 0.0096,
|
|
"num_tokens": 170751355.0,
|
|
"reward": 1.2173829078674316,
|
|
"reward_std": 0.1510533094406128,
|
|
"rewards/accuracy_reward_long_step": 0.375,
|
|
"rewards/final_brier_reward_long_step": 0.6595523357391357,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7099793553352356,
|
|
"step": 348
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 456.0,
|
|
"completions/max_terminated_length": 456.0,
|
|
"completions/mean_length": 239.58203125,
|
|
"completions/mean_terminated_length": 239.58203125,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.5584,
|
|
"grad_norm": 0.04001186043024063,
|
|
"learning_rate": 4.91103202846975e-07,
|
|
"loss": -0.002,
|
|
"num_tokens": 171235744.0,
|
|
"reward": 1.4833966493606567,
|
|
"reward_std": 0.1600235551595688,
|
|
"rewards/accuracy_reward_long_step": 0.5859375,
|
|
"rewards/final_brier_reward_long_step": 0.8168105483055115,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7730263471603394,
|
|
"step": 349
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 571.0,
|
|
"completions/max_terminated_length": 571.0,
|
|
"completions/mean_length": 255.8125,
|
|
"completions/mean_terminated_length": 255.8125,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.56,
|
|
"grad_norm": 0.03591832518577576,
|
|
"learning_rate": 4.893238434163701e-07,
|
|
"loss": -0.0068,
|
|
"num_tokens": 171725112.0,
|
|
"reward": 1.219707727432251,
|
|
"reward_std": 0.14950095117092133,
|
|
"rewards/accuracy_reward_long_step": 0.37109375,
|
|
"rewards/final_brier_reward_long_step": 0.6841285228729248,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7103271484375,
|
|
"step": 350
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 608.0,
|
|
"completions/max_terminated_length": 608.0,
|
|
"completions/mean_length": 253.22265625,
|
|
"completions/mean_terminated_length": 253.22265625,
|
|
"completions/min_length": 80.0,
|
|
"completions/min_terminated_length": 80.0,
|
|
"epoch": 0.5616,
|
|
"grad_norm": 0.033379342406988144,
|
|
"learning_rate": 4.875444839857651e-07,
|
|
"loss": 0.0029,
|
|
"num_tokens": 172224321.0,
|
|
"reward": 1.3003888130187988,
|
|
"reward_std": 0.14580082893371582,
|
|
"rewards/accuracy_reward_long_step": 0.41015625,
|
|
"rewards/final_brier_reward_long_step": 0.7715871334075928,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7893432974815369,
|
|
"step": 351
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 505.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 247.00390625,
|
|
"completions/mean_terminated_length": 247.00390625,
|
|
"completions/min_length": 97.0,
|
|
"completions/min_terminated_length": 97.0,
|
|
"epoch": 0.5632,
|
|
"grad_norm": 0.03563377261161804,
|
|
"learning_rate": 4.857651245551601e-07,
|
|
"loss": -0.0141,
|
|
"num_tokens": 172723978.0,
|
|
"reward": 1.2515490055084229,
|
|
"reward_std": 0.20609885454177856,
|
|
"rewards/accuracy_reward_long_step": 0.42578125,
|
|
"rewards/final_brier_reward_long_step": 0.5958093404769897,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7072615027427673,
|
|
"step": 352
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 444.0,
|
|
"completions/max_terminated_length": 444.0,
|
|
"completions/mean_length": 240.86328125,
|
|
"completions/mean_terminated_length": 240.86328125,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.5648,
|
|
"grad_norm": 0.030260441824793816,
|
|
"learning_rate": 4.839857651245551e-07,
|
|
"loss": 0.0111,
|
|
"num_tokens": 173228511.0,
|
|
"reward": 1.430063009262085,
|
|
"reward_std": 0.10446056723594666,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.7712934017181396,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8552085757255554,
|
|
"step": 353
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 423.0,
|
|
"completions/max_terminated_length": 423.0,
|
|
"completions/mean_length": 246.74609375,
|
|
"completions/mean_terminated_length": 246.74609375,
|
|
"completions/min_length": 116.0,
|
|
"completions/min_terminated_length": 116.0,
|
|
"epoch": 0.5664,
|
|
"grad_norm": 0.03534315153956413,
|
|
"learning_rate": 4.822064056939501e-07,
|
|
"loss": 0.007,
|
|
"num_tokens": 173716982.0,
|
|
"reward": 1.3442493677139282,
|
|
"reward_std": 0.20917916297912598,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.7205374836921692,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7345851063728333,
|
|
"step": 354
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 458.0,
|
|
"completions/max_terminated_length": 458.0,
|
|
"completions/mean_length": 243.53125,
|
|
"completions/mean_terminated_length": 243.53125,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.568,
|
|
"grad_norm": 0.046947211027145386,
|
|
"learning_rate": 4.804270462633451e-07,
|
|
"loss": -0.0176,
|
|
"num_tokens": 174209438.0,
|
|
"reward": 1.4447617530822754,
|
|
"reward_std": 0.1805291771888733,
|
|
"rewards/accuracy_reward_long_step": 0.57421875,
|
|
"rewards/final_brier_reward_long_step": 0.7230343818664551,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7591376304626465,
|
|
"step": 355
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 611.0,
|
|
"completions/max_terminated_length": 611.0,
|
|
"completions/mean_length": 244.32421875,
|
|
"completions/mean_terminated_length": 244.32421875,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.5696,
|
|
"grad_norm": 0.03166520223021507,
|
|
"learning_rate": 4.786476868327403e-07,
|
|
"loss": 0.0097,
|
|
"num_tokens": 174683969.0,
|
|
"reward": 1.4561214447021484,
|
|
"reward_std": 0.1352284848690033,
|
|
"rewards/accuracy_reward_long_step": 0.57421875,
|
|
"rewards/final_brier_reward_long_step": 0.7665960788726807,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7610146999359131,
|
|
"step": 356
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 540.0,
|
|
"completions/max_terminated_length": 540.0,
|
|
"completions/mean_length": 265.96875,
|
|
"completions/mean_terminated_length": 265.96875,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.5712,
|
|
"grad_norm": 0.03730427846312523,
|
|
"learning_rate": 4.768683274021353e-07,
|
|
"loss": -0.0075,
|
|
"num_tokens": 175174505.0,
|
|
"reward": 1.296769142150879,
|
|
"reward_std": 0.15302547812461853,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7661605477333069,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7646663188934326,
|
|
"step": 357
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 571.0,
|
|
"completions/max_terminated_length": 571.0,
|
|
"completions/mean_length": 258.87890625,
|
|
"completions/mean_terminated_length": 258.87890625,
|
|
"completions/min_length": 144.0,
|
|
"completions/min_terminated_length": 144.0,
|
|
"epoch": 0.5728,
|
|
"grad_norm": 0.038667719811201096,
|
|
"learning_rate": 4.7508896797153023e-07,
|
|
"loss": 0.01,
|
|
"num_tokens": 175667706.0,
|
|
"reward": 1.4412386417388916,
|
|
"reward_std": 0.21622003614902496,
|
|
"rewards/accuracy_reward_long_step": 0.546875,
|
|
"rewards/final_brier_reward_long_step": 0.7790859341621399,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8061808347702026,
|
|
"step": 358
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 449.0,
|
|
"completions/max_terminated_length": 449.0,
|
|
"completions/mean_length": 249.40234375,
|
|
"completions/mean_terminated_length": 249.40234375,
|
|
"completions/min_length": 106.0,
|
|
"completions/min_terminated_length": 106.0,
|
|
"epoch": 0.5744,
|
|
"grad_norm": 0.03672811761498451,
|
|
"learning_rate": 4.733096085409252e-07,
|
|
"loss": 0.0064,
|
|
"num_tokens": 176151705.0,
|
|
"reward": 1.4096336364746094,
|
|
"reward_std": 0.14016187191009521,
|
|
"rewards/accuracy_reward_long_step": 0.51171875,
|
|
"rewards/final_brier_reward_long_step": 0.8072555065155029,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7844040393829346,
|
|
"step": 359
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 491.0,
|
|
"completions/max_terminated_length": 491.0,
|
|
"completions/mean_length": 256.26953125,
|
|
"completions/mean_terminated_length": 256.26953125,
|
|
"completions/min_length": 111.0,
|
|
"completions/min_terminated_length": 111.0,
|
|
"epoch": 0.576,
|
|
"grad_norm": 0.031908176839351654,
|
|
"learning_rate": 4.7153024911032026e-07,
|
|
"loss": 0.0018,
|
|
"num_tokens": 176654878.0,
|
|
"reward": 1.2949192523956299,
|
|
"reward_std": 0.1339997947216034,
|
|
"rewards/accuracy_reward_long_step": 0.41015625,
|
|
"rewards/final_brier_reward_long_step": 0.7606054544448853,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7784462571144104,
|
|
"step": 360
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 586.0,
|
|
"completions/max_terminated_length": 586.0,
|
|
"completions/mean_length": 260.42578125,
|
|
"completions/mean_terminated_length": 260.42578125,
|
|
"completions/min_length": 145.0,
|
|
"completions/min_terminated_length": 145.0,
|
|
"epoch": 0.5776,
|
|
"grad_norm": 0.03353104740381241,
|
|
"learning_rate": 4.697508896797153e-07,
|
|
"loss": -0.0028,
|
|
"num_tokens": 177158483.0,
|
|
"reward": 1.3936898708343506,
|
|
"reward_std": 0.17937517166137695,
|
|
"rewards/accuracy_reward_long_step": 0.51953125,
|
|
"rewards/final_brier_reward_long_step": 0.763106644153595,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7335278391838074,
|
|
"step": 361
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 533.0,
|
|
"completions/max_terminated_length": 533.0,
|
|
"completions/mean_length": 250.55078125,
|
|
"completions/mean_terminated_length": 250.55078125,
|
|
"completions/min_length": 155.0,
|
|
"completions/min_terminated_length": 155.0,
|
|
"epoch": 0.5792,
|
|
"grad_norm": 0.0516292005777359,
|
|
"learning_rate": 4.679715302491103e-07,
|
|
"loss": 0.002,
|
|
"num_tokens": 177665080.0,
|
|
"reward": 1.3700807094573975,
|
|
"reward_std": 0.1468803435564041,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.6796808242797852,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7068922519683838,
|
|
"step": 362
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 593.0,
|
|
"completions/max_terminated_length": 593.0,
|
|
"completions/mean_length": 257.5625,
|
|
"completions/mean_terminated_length": 257.5625,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.5808,
|
|
"grad_norm": 0.036485347896814346,
|
|
"learning_rate": 4.661921708185053e-07,
|
|
"loss": -0.0097,
|
|
"num_tokens": 178161496.0,
|
|
"reward": 1.1560263633728027,
|
|
"reward_std": 0.13470560312271118,
|
|
"rewards/accuracy_reward_long_step": 0.2890625,
|
|
"rewards/final_brier_reward_long_step": 0.6383723020553589,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.829483687877655,
|
|
"step": 363
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 399.0,
|
|
"completions/max_terminated_length": 399.0,
|
|
"completions/mean_length": 231.03515625,
|
|
"completions/mean_terminated_length": 231.03515625,
|
|
"completions/min_length": 129.0,
|
|
"completions/min_terminated_length": 129.0,
|
|
"epoch": 0.5824,
|
|
"grad_norm": 0.039585795253515244,
|
|
"learning_rate": 4.644128113879003e-07,
|
|
"loss": 0.0097,
|
|
"num_tokens": 178643185.0,
|
|
"reward": 1.4548025131225586,
|
|
"reward_std": 0.1070779412984848,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.7518347501754761,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7861253023147583,
|
|
"step": 364
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 505.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 240.80078125,
|
|
"completions/mean_terminated_length": 240.80078125,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.584,
|
|
"grad_norm": 0.037114016711711884,
|
|
"learning_rate": 4.626334519572954e-07,
|
|
"loss": -0.0042,
|
|
"num_tokens": 179134550.0,
|
|
"reward": 1.3695037364959717,
|
|
"reward_std": 0.16490252315998077,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.7435758113861084,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8281888365745544,
|
|
"step": 365
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 478.0,
|
|
"completions/max_terminated_length": 478.0,
|
|
"completions/mean_length": 236.41796875,
|
|
"completions/mean_terminated_length": 236.41796875,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.5856,
|
|
"grad_norm": 0.03539412468671799,
|
|
"learning_rate": 4.608540925266904e-07,
|
|
"loss": 0.0036,
|
|
"num_tokens": 179618601.0,
|
|
"reward": 1.3902171850204468,
|
|
"reward_std": 0.11728814244270325,
|
|
"rewards/accuracy_reward_long_step": 0.51953125,
|
|
"rewards/final_brier_reward_long_step": 0.767492949962616,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7152509689331055,
|
|
"step": 366
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 467.0,
|
|
"completions/max_terminated_length": 467.0,
|
|
"completions/mean_length": 250.51953125,
|
|
"completions/mean_terminated_length": 250.51953125,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.5872,
|
|
"grad_norm": 0.03185239061713219,
|
|
"learning_rate": 4.590747330960854e-07,
|
|
"loss": 0.0033,
|
|
"num_tokens": 180097414.0,
|
|
"reward": 1.4346997737884521,
|
|
"reward_std": 0.12333646416664124,
|
|
"rewards/accuracy_reward_long_step": 0.51953125,
|
|
"rewards/final_brier_reward_long_step": 0.8788655996322632,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7818087339401245,
|
|
"step": 367
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 491.0,
|
|
"completions/max_terminated_length": 491.0,
|
|
"completions/mean_length": 243.2421875,
|
|
"completions/mean_terminated_length": 243.2421875,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.5888,
|
|
"grad_norm": 0.05108208209276199,
|
|
"learning_rate": 4.5729537366548043e-07,
|
|
"loss": 0.0105,
|
|
"num_tokens": 180588972.0,
|
|
"reward": 1.4830061197280884,
|
|
"reward_std": 0.13395658135414124,
|
|
"rewards/accuracy_reward_long_step": 0.59375,
|
|
"rewards/final_brier_reward_long_step": 0.7892441749572754,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7677804231643677,
|
|
"step": 368
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 488.0,
|
|
"completions/max_terminated_length": 488.0,
|
|
"completions/mean_length": 250.55859375,
|
|
"completions/mean_terminated_length": 250.55859375,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.5904,
|
|
"grad_norm": 0.033352505415678024,
|
|
"learning_rate": 4.555160142348754e-07,
|
|
"loss": -0.0026,
|
|
"num_tokens": 181083883.0,
|
|
"reward": 1.5408804416656494,
|
|
"reward_std": 0.17830899357795715,
|
|
"rewards/accuracy_reward_long_step": 0.62109375,
|
|
"rewards/final_brier_reward_long_step": 0.8529410362243652,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.826205849647522,
|
|
"step": 369
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 660.0,
|
|
"completions/max_terminated_length": 660.0,
|
|
"completions/mean_length": 249.2734375,
|
|
"completions/mean_terminated_length": 249.2734375,
|
|
"completions/min_length": 95.0,
|
|
"completions/min_terminated_length": 95.0,
|
|
"epoch": 0.592,
|
|
"grad_norm": 0.03348240256309509,
|
|
"learning_rate": 4.537366548042704e-07,
|
|
"loss": 0.006,
|
|
"num_tokens": 181570745.0,
|
|
"reward": 1.3028314113616943,
|
|
"reward_std": 0.21476979553699493,
|
|
"rewards/accuracy_reward_long_step": 0.44140625,
|
|
"rewards/final_brier_reward_long_step": 0.7177179455757141,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7357947826385498,
|
|
"step": 370
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 470.0,
|
|
"completions/max_terminated_length": 470.0,
|
|
"completions/mean_length": 235.62109375,
|
|
"completions/mean_terminated_length": 235.62109375,
|
|
"completions/min_length": 126.0,
|
|
"completions/min_terminated_length": 126.0,
|
|
"epoch": 0.5936,
|
|
"grad_norm": 0.029231376945972443,
|
|
"learning_rate": 4.519572953736655e-07,
|
|
"loss": -0.0131,
|
|
"num_tokens": 182061952.0,
|
|
"reward": 1.4700736999511719,
|
|
"reward_std": 0.0870005190372467,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.7968558669090271,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.833439290523529,
|
|
"step": 371
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 489.0,
|
|
"completions/max_terminated_length": 489.0,
|
|
"completions/mean_length": 244.97265625,
|
|
"completions/mean_terminated_length": 244.97265625,
|
|
"completions/min_length": 143.0,
|
|
"completions/min_terminated_length": 143.0,
|
|
"epoch": 0.5952,
|
|
"grad_norm": 0.0341234989464283,
|
|
"learning_rate": 4.501779359430605e-07,
|
|
"loss": -0.003,
|
|
"num_tokens": 182547297.0,
|
|
"reward": 1.5000258684158325,
|
|
"reward_std": 0.12720796465873718,
|
|
"rewards/accuracy_reward_long_step": 0.609375,
|
|
"rewards/final_brier_reward_long_step": 0.8284410238265991,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7341622710227966,
|
|
"step": 372
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 557.0,
|
|
"completions/max_terminated_length": 557.0,
|
|
"completions/mean_length": 272.23046875,
|
|
"completions/mean_terminated_length": 272.23046875,
|
|
"completions/min_length": 111.0,
|
|
"completions/min_terminated_length": 111.0,
|
|
"epoch": 0.5968,
|
|
"grad_norm": 0.032760150730609894,
|
|
"learning_rate": 4.483985765124555e-07,
|
|
"loss": 0.0008,
|
|
"num_tokens": 183047828.0,
|
|
"reward": 1.2245934009552002,
|
|
"reward_std": 0.14938510954380035,
|
|
"rewards/accuracy_reward_long_step": 0.33984375,
|
|
"rewards/final_brier_reward_long_step": 0.738226592540741,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.800771951675415,
|
|
"step": 373
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 484.0,
|
|
"completions/max_terminated_length": 484.0,
|
|
"completions/mean_length": 236.62109375,
|
|
"completions/mean_terminated_length": 236.62109375,
|
|
"completions/min_length": 139.0,
|
|
"completions/min_terminated_length": 139.0,
|
|
"epoch": 0.5984,
|
|
"grad_norm": 0.035608965903520584,
|
|
"learning_rate": 4.466192170818505e-07,
|
|
"loss": -0.0005,
|
|
"num_tokens": 183518363.0,
|
|
"reward": 1.4495090246200562,
|
|
"reward_std": 0.1940980851650238,
|
|
"rewards/accuracy_reward_long_step": 0.58984375,
|
|
"rewards/final_brier_reward_long_step": 0.6856546401977539,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7530063390731812,
|
|
"step": 374
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 480.0,
|
|
"completions/max_terminated_length": 480.0,
|
|
"completions/mean_length": 253.03125,
|
|
"completions/mean_terminated_length": 253.03125,
|
|
"completions/min_length": 145.0,
|
|
"completions/min_terminated_length": 145.0,
|
|
"epoch": 0.6,
|
|
"grad_norm": 0.03018086962401867,
|
|
"learning_rate": 4.4483985765124553e-07,
|
|
"loss": -0.0038,
|
|
"num_tokens": 184007499.0,
|
|
"reward": 1.4177751541137695,
|
|
"reward_std": 0.1175907552242279,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.830146849155426,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7550168037414551,
|
|
"step": 375
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 414.0,
|
|
"completions/max_terminated_length": 414.0,
|
|
"completions/mean_length": 231.6640625,
|
|
"completions/mean_terminated_length": 231.6640625,
|
|
"completions/min_length": 152.0,
|
|
"completions/min_terminated_length": 152.0,
|
|
"epoch": 0.6016,
|
|
"grad_norm": 0.03529973700642586,
|
|
"learning_rate": 4.4306049822064055e-07,
|
|
"loss": -0.005,
|
|
"num_tokens": 184481933.0,
|
|
"reward": 1.4107120037078857,
|
|
"reward_std": 0.17283451557159424,
|
|
"rewards/accuracy_reward_long_step": 0.55078125,
|
|
"rewards/final_brier_reward_long_step": 0.6837781667709351,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7559454441070557,
|
|
"step": 376
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 490.0,
|
|
"completions/max_terminated_length": 490.0,
|
|
"completions/mean_length": 248.328125,
|
|
"completions/mean_terminated_length": 248.328125,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.6032,
|
|
"grad_norm": 0.032051555812358856,
|
|
"learning_rate": 4.412811387900356e-07,
|
|
"loss": 0.0033,
|
|
"num_tokens": 184971921.0,
|
|
"reward": 1.4520785808563232,
|
|
"reward_std": 0.23111991584300995,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.7146296501159668,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8124346733093262,
|
|
"step": 377
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 507.0,
|
|
"completions/max_terminated_length": 507.0,
|
|
"completions/mean_length": 234.9921875,
|
|
"completions/mean_terminated_length": 234.9921875,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.6048,
|
|
"grad_norm": 0.03357694298028946,
|
|
"learning_rate": 4.395017793594306e-07,
|
|
"loss": 0.0024,
|
|
"num_tokens": 185440015.0,
|
|
"reward": 1.429476261138916,
|
|
"reward_std": 0.119395412504673,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.7937347888946533,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7991704940795898,
|
|
"step": 378
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 560.0,
|
|
"completions/max_terminated_length": 560.0,
|
|
"completions/mean_length": 260.4453125,
|
|
"completions/mean_terminated_length": 260.4453125,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.6064,
|
|
"grad_norm": 0.029874242842197418,
|
|
"learning_rate": 4.377224199288256e-07,
|
|
"loss": 0.0054,
|
|
"num_tokens": 185943985.0,
|
|
"reward": 1.489135980606079,
|
|
"reward_std": 0.123079814016819,
|
|
"rewards/accuracy_reward_long_step": 0.5546875,
|
|
"rewards/final_brier_reward_long_step": 0.8251116871833801,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.9126821160316467,
|
|
"step": 379
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 592.0,
|
|
"completions/max_terminated_length": 592.0,
|
|
"completions/mean_length": 249.58203125,
|
|
"completions/mean_terminated_length": 249.58203125,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"epoch": 0.608,
|
|
"grad_norm": 0.03442827984690666,
|
|
"learning_rate": 4.359430604982206e-07,
|
|
"loss": -0.0082,
|
|
"num_tokens": 186443814.0,
|
|
"reward": 1.2993438243865967,
|
|
"reward_std": 0.12685778737068176,
|
|
"rewards/accuracy_reward_long_step": 0.40625,
|
|
"rewards/final_brier_reward_long_step": 0.7798289060592651,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7925466299057007,
|
|
"step": 380
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 572.0,
|
|
"completions/max_terminated_length": 572.0,
|
|
"completions/mean_length": 254.453125,
|
|
"completions/mean_terminated_length": 254.453125,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.6096,
|
|
"grad_norm": 0.032972000539302826,
|
|
"learning_rate": 4.341637010676156e-07,
|
|
"loss": -0.002,
|
|
"num_tokens": 186948906.0,
|
|
"reward": 1.3653643131256104,
|
|
"reward_std": 0.1496572494506836,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.7358413934707642,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7881159782409668,
|
|
"step": 381
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 460.0,
|
|
"completions/max_terminated_length": 460.0,
|
|
"completions/mean_length": 243.390625,
|
|
"completions/mean_terminated_length": 243.390625,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.6112,
|
|
"grad_norm": 0.03480248898267746,
|
|
"learning_rate": 4.3238434163701063e-07,
|
|
"loss": 0.0027,
|
|
"num_tokens": 187445558.0,
|
|
"reward": 1.545555830001831,
|
|
"reward_std": 0.17457936704158783,
|
|
"rewards/accuracy_reward_long_step": 0.6171875,
|
|
"rewards/final_brier_reward_long_step": 0.865576982498169,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8478966355323792,
|
|
"step": 382
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 564.0,
|
|
"completions/max_terminated_length": 564.0,
|
|
"completions/mean_length": 252.15234375,
|
|
"completions/mean_terminated_length": 252.15234375,
|
|
"completions/min_length": 126.0,
|
|
"completions/min_terminated_length": 126.0,
|
|
"epoch": 0.6128,
|
|
"grad_norm": 0.03224232792854309,
|
|
"learning_rate": 4.306049822064057e-07,
|
|
"loss": 0.0002,
|
|
"num_tokens": 187939005.0,
|
|
"reward": 1.5401490926742554,
|
|
"reward_std": 0.1554555892944336,
|
|
"rewards/accuracy_reward_long_step": 0.625,
|
|
"rewards/final_brier_reward_long_step": 0.8096957206726074,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8509008288383484,
|
|
"step": 383
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 557.0,
|
|
"completions/max_terminated_length": 557.0,
|
|
"completions/mean_length": 249.1328125,
|
|
"completions/mean_terminated_length": 249.1328125,
|
|
"completions/min_length": 116.0,
|
|
"completions/min_terminated_length": 116.0,
|
|
"epoch": 0.6144,
|
|
"grad_norm": 0.029847221449017525,
|
|
"learning_rate": 4.288256227758007e-07,
|
|
"loss": 0.0013,
|
|
"num_tokens": 188446527.0,
|
|
"reward": 1.3999953269958496,
|
|
"reward_std": 0.09057177603244781,
|
|
"rewards/accuracy_reward_long_step": 0.5078125,
|
|
"rewards/final_brier_reward_long_step": 0.7741625308990479,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.794569194316864,
|
|
"step": 384
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 580.0,
|
|
"completions/max_terminated_length": 580.0,
|
|
"completions/mean_length": 264.25390625,
|
|
"completions/mean_terminated_length": 264.25390625,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.616,
|
|
"grad_norm": 0.031437478959560394,
|
|
"learning_rate": 4.2704626334519573e-07,
|
|
"loss": 0.0145,
|
|
"num_tokens": 188934968.0,
|
|
"reward": 1.4329785108566284,
|
|
"reward_std": 0.14642232656478882,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.7519199252128601,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8081189393997192,
|
|
"step": 385
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 450.0,
|
|
"completions/max_terminated_length": 450.0,
|
|
"completions/mean_length": 246.9140625,
|
|
"completions/mean_terminated_length": 246.9140625,
|
|
"completions/min_length": 139.0,
|
|
"completions/min_terminated_length": 139.0,
|
|
"epoch": 0.6176,
|
|
"grad_norm": 0.04302318021655083,
|
|
"learning_rate": 4.2526690391459074e-07,
|
|
"loss": -0.0079,
|
|
"num_tokens": 189434826.0,
|
|
"reward": 1.3921318054199219,
|
|
"reward_std": 0.17010337114334106,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.7150309085845947,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6816216111183167,
|
|
"step": 386
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 590.0,
|
|
"completions/max_terminated_length": 590.0,
|
|
"completions/mean_length": 254.37890625,
|
|
"completions/mean_terminated_length": 254.37890625,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.6192,
|
|
"grad_norm": 0.03536539152264595,
|
|
"learning_rate": 4.2348754448398576e-07,
|
|
"loss": -0.0079,
|
|
"num_tokens": 189913979.0,
|
|
"reward": 1.4037388563156128,
|
|
"reward_std": 0.11961972713470459,
|
|
"rewards/accuracy_reward_long_step": 0.515625,
|
|
"rewards/final_brier_reward_long_step": 0.738335907459259,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8141195178031921,
|
|
"step": 387
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 506.0,
|
|
"completions/max_terminated_length": 506.0,
|
|
"completions/mean_length": 266.3203125,
|
|
"completions/mean_terminated_length": 266.3203125,
|
|
"completions/min_length": 141.0,
|
|
"completions/min_terminated_length": 141.0,
|
|
"epoch": 0.6208,
|
|
"grad_norm": 0.03417327627539635,
|
|
"learning_rate": 4.217081850533807e-07,
|
|
"loss": -0.0023,
|
|
"num_tokens": 190421693.0,
|
|
"reward": 1.4262065887451172,
|
|
"reward_std": 0.10677627474069595,
|
|
"rewards/accuracy_reward_long_step": 0.53515625,
|
|
"rewards/final_brier_reward_long_step": 0.7934492230415344,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7707524299621582,
|
|
"step": 388
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 442.0,
|
|
"completions/max_terminated_length": 442.0,
|
|
"completions/mean_length": 267.72265625,
|
|
"completions/mean_terminated_length": 267.72265625,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.6224,
|
|
"grad_norm": 0.03453484922647476,
|
|
"learning_rate": 4.199288256227758e-07,
|
|
"loss": 0.0004,
|
|
"num_tokens": 190935686.0,
|
|
"reward": 1.2190345525741577,
|
|
"reward_std": 0.14494457840919495,
|
|
"rewards/accuracy_reward_long_step": 0.33984375,
|
|
"rewards/final_brier_reward_long_step": 0.7603257894515991,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.756437361240387,
|
|
"step": 389
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 447.0,
|
|
"completions/max_terminated_length": 447.0,
|
|
"completions/mean_length": 248.13671875,
|
|
"completions/mean_terminated_length": 248.13671875,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.624,
|
|
"grad_norm": 0.033618371933698654,
|
|
"learning_rate": 4.181494661921708e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 191423305.0,
|
|
"reward": 1.367628812789917,
|
|
"reward_std": 0.13067224621772766,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.7947276830673218,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7695374488830566,
|
|
"step": 390
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 564.0,
|
|
"completions/max_terminated_length": 564.0,
|
|
"completions/mean_length": 263.71875,
|
|
"completions/mean_terminated_length": 263.71875,
|
|
"completions/min_length": 142.0,
|
|
"completions/min_terminated_length": 142.0,
|
|
"epoch": 0.6256,
|
|
"grad_norm": 0.0348045788705349,
|
|
"learning_rate": 4.163701067615658e-07,
|
|
"loss": 0.0067,
|
|
"num_tokens": 191928945.0,
|
|
"reward": 1.6183103322982788,
|
|
"reward_std": 0.16025137901306152,
|
|
"rewards/accuracy_reward_long_step": 0.71875,
|
|
"rewards/final_brier_reward_long_step": 0.8521628379821777,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7538907527923584,
|
|
"step": 391
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 599.0,
|
|
"completions/max_terminated_length": 599.0,
|
|
"completions/mean_length": 265.98828125,
|
|
"completions/mean_terminated_length": 265.98828125,
|
|
"completions/min_length": 164.0,
|
|
"completions/min_terminated_length": 164.0,
|
|
"epoch": 0.6272,
|
|
"grad_norm": 0.028777770698070526,
|
|
"learning_rate": 4.1459074733096083e-07,
|
|
"loss": 0.0073,
|
|
"num_tokens": 192431814.0,
|
|
"reward": 1.4733824729919434,
|
|
"reward_std": 0.14600692689418793,
|
|
"rewards/accuracy_reward_long_step": 0.5859375,
|
|
"rewards/final_brier_reward_long_step": 0.7584691047668457,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7913106679916382,
|
|
"step": 392
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 462.0,
|
|
"completions/max_terminated_length": 462.0,
|
|
"completions/mean_length": 263.95703125,
|
|
"completions/mean_terminated_length": 263.95703125,
|
|
"completions/min_length": 161.0,
|
|
"completions/min_terminated_length": 161.0,
|
|
"epoch": 0.6288,
|
|
"grad_norm": 0.03086771070957184,
|
|
"learning_rate": 4.1281138790035585e-07,
|
|
"loss": -0.014,
|
|
"num_tokens": 192932539.0,
|
|
"reward": 1.2810169458389282,
|
|
"reward_std": 0.08168387413024902,
|
|
"rewards/accuracy_reward_long_step": 0.43359375,
|
|
"rewards/final_brier_reward_long_step": 0.6735238432884216,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7239813804626465,
|
|
"step": 393
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 504.0,
|
|
"completions/max_terminated_length": 504.0,
|
|
"completions/mean_length": 252.90625,
|
|
"completions/mean_terminated_length": 253.89805603027344,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 139.0,
|
|
"epoch": 0.6304,
|
|
"grad_norm": 0.033842138946056366,
|
|
"learning_rate": 4.1103202846975086e-07,
|
|
"loss": -0.0147,
|
|
"num_tokens": 193433083.0,
|
|
"reward": 1.4617502689361572,
|
|
"reward_std": 0.16944709420204163,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.8041574358940125,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7694060206413269,
|
|
"step": 394
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 527.0,
|
|
"completions/max_terminated_length": 527.0,
|
|
"completions/mean_length": 252.5546875,
|
|
"completions/mean_terminated_length": 252.5546875,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.632,
|
|
"grad_norm": 0.035002097487449646,
|
|
"learning_rate": 4.0925266903914593e-07,
|
|
"loss": 0.0012,
|
|
"num_tokens": 193924961.0,
|
|
"reward": 1.5998687744140625,
|
|
"reward_std": 0.14062157273292542,
|
|
"rewards/accuracy_reward_long_step": 0.69140625,
|
|
"rewards/final_brier_reward_long_step": 0.7885218858718872,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8453285694122314,
|
|
"step": 395
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 537.0,
|
|
"completions/max_terminated_length": 537.0,
|
|
"completions/mean_length": 264.9765625,
|
|
"completions/mean_terminated_length": 264.9765625,
|
|
"completions/min_length": 88.0,
|
|
"completions/min_terminated_length": 88.0,
|
|
"epoch": 0.6336,
|
|
"grad_norm": 0.0325743593275547,
|
|
"learning_rate": 4.0747330960854094e-07,
|
|
"loss": 0.0099,
|
|
"num_tokens": 194417523.0,
|
|
"reward": 1.5281362533569336,
|
|
"reward_std": 0.14003178477287292,
|
|
"rewards/accuracy_reward_long_step": 0.6171875,
|
|
"rewards/final_brier_reward_long_step": 0.816343367099762,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8274516463279724,
|
|
"step": 396
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 524.0,
|
|
"completions/max_terminated_length": 524.0,
|
|
"completions/mean_length": 266.8125,
|
|
"completions/mean_terminated_length": 266.8125,
|
|
"completions/min_length": 174.0,
|
|
"completions/min_terminated_length": 174.0,
|
|
"epoch": 0.6352,
|
|
"grad_norm": 0.044015269726514816,
|
|
"learning_rate": 4.0569395017793596e-07,
|
|
"loss": -0.0169,
|
|
"num_tokens": 194911235.0,
|
|
"reward": 1.1511602401733398,
|
|
"reward_std": 0.0747460424900055,
|
|
"rewards/accuracy_reward_long_step": 0.27734375,
|
|
"rewards/final_brier_reward_long_step": 0.7376371622085571,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7576285004615784,
|
|
"step": 397
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 502.0,
|
|
"completions/max_terminated_length": 502.0,
|
|
"completions/mean_length": 258.0625,
|
|
"completions/mean_terminated_length": 258.0625,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.6368,
|
|
"grad_norm": 0.035533856600522995,
|
|
"learning_rate": 4.039145907473309e-07,
|
|
"loss": 0.0121,
|
|
"num_tokens": 195414379.0,
|
|
"reward": 1.45721435546875,
|
|
"reward_std": 0.18415778875350952,
|
|
"rewards/accuracy_reward_long_step": 0.56640625,
|
|
"rewards/final_brier_reward_long_step": 0.7342562675476074,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8289762735366821,
|
|
"step": 398
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 487.0,
|
|
"completions/max_terminated_length": 487.0,
|
|
"completions/mean_length": 260.1484375,
|
|
"completions/mean_terminated_length": 260.1484375,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 0.6384,
|
|
"grad_norm": 0.04100572690367699,
|
|
"learning_rate": 4.0213523131672593e-07,
|
|
"loss": 0.0142,
|
|
"num_tokens": 195921505.0,
|
|
"reward": 1.4241199493408203,
|
|
"reward_std": 0.12187729775905609,
|
|
"rewards/accuracy_reward_long_step": 0.53515625,
|
|
"rewards/final_brier_reward_long_step": 0.7385929822921753,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8172620534896851,
|
|
"step": 399
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 545.0,
|
|
"completions/max_terminated_length": 545.0,
|
|
"completions/mean_length": 260.5234375,
|
|
"completions/mean_terminated_length": 260.5234375,
|
|
"completions/min_length": 122.0,
|
|
"completions/min_terminated_length": 122.0,
|
|
"epoch": 0.64,
|
|
"grad_norm": 0.03168834373354912,
|
|
"learning_rate": 4.0035587188612095e-07,
|
|
"loss": 0.015,
|
|
"num_tokens": 196418567.0,
|
|
"reward": 1.468137264251709,
|
|
"reward_std": 0.12736788392066956,
|
|
"rewards/accuracy_reward_long_step": 0.546875,
|
|
"rewards/final_brier_reward_long_step": 0.796457052230835,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8885919451713562,
|
|
"step": 400
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 451.0,
|
|
"completions/max_terminated_length": 451.0,
|
|
"completions/mean_length": 259.34765625,
|
|
"completions/mean_terminated_length": 259.34765625,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.6416,
|
|
"grad_norm": 0.07957521826028824,
|
|
"learning_rate": 3.98576512455516e-07,
|
|
"loss": 0.0156,
|
|
"num_tokens": 196899368.0,
|
|
"reward": 1.3013485670089722,
|
|
"reward_std": 0.1320360153913498,
|
|
"rewards/accuracy_reward_long_step": 0.41796875,
|
|
"rewards/final_brier_reward_long_step": 0.7306581735610962,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8028608560562134,
|
|
"step": 401
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 417.0,
|
|
"completions/max_terminated_length": 417.0,
|
|
"completions/mean_length": 258.40234375,
|
|
"completions/mean_terminated_length": 258.40234375,
|
|
"completions/min_length": 172.0,
|
|
"completions/min_terminated_length": 172.0,
|
|
"epoch": 0.6432,
|
|
"grad_norm": 0.04176841303706169,
|
|
"learning_rate": 3.9679715302491103e-07,
|
|
"loss": 0.0149,
|
|
"num_tokens": 197393599.0,
|
|
"reward": 1.4583325386047363,
|
|
"reward_std": 0.17372997105121613,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.7547035217285156,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8286267518997192,
|
|
"step": 402
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 430.0,
|
|
"completions/max_terminated_length": 430.0,
|
|
"completions/mean_length": 262.3828125,
|
|
"completions/mean_terminated_length": 262.3828125,
|
|
"completions/min_length": 111.0,
|
|
"completions/min_terminated_length": 111.0,
|
|
"epoch": 0.6448,
|
|
"grad_norm": 0.031080788001418114,
|
|
"learning_rate": 3.9501779359430604e-07,
|
|
"loss": -0.008,
|
|
"num_tokens": 197880809.0,
|
|
"reward": 1.3304669857025146,
|
|
"reward_std": 0.11997392773628235,
|
|
"rewards/accuracy_reward_long_step": 0.4453125,
|
|
"rewards/final_brier_reward_long_step": 0.7705242037773132,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7700934410095215,
|
|
"step": 403
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 539.0,
|
|
"completions/max_terminated_length": 539.0,
|
|
"completions/mean_length": 257.9296875,
|
|
"completions/mean_terminated_length": 257.9296875,
|
|
"completions/min_length": 106.0,
|
|
"completions/min_terminated_length": 106.0,
|
|
"epoch": 0.6464,
|
|
"grad_norm": 0.0383228063583374,
|
|
"learning_rate": 3.9323843416370106e-07,
|
|
"loss": 0.0168,
|
|
"num_tokens": 198364775.0,
|
|
"reward": 1.4343302249908447,
|
|
"reward_std": 0.13839168846607208,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.7040433883666992,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7832778692245483,
|
|
"step": 404
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 505.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 245.3828125,
|
|
"completions/mean_terminated_length": 245.3828125,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.648,
|
|
"grad_norm": 0.04407874867320061,
|
|
"learning_rate": 3.9145907473309607e-07,
|
|
"loss": -0.0001,
|
|
"num_tokens": 198837905.0,
|
|
"reward": 1.4450054168701172,
|
|
"reward_std": 0.15000846982002258,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.7329072952270508,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.765864372253418,
|
|
"step": 405
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 488.0,
|
|
"completions/max_terminated_length": 488.0,
|
|
"completions/mean_length": 248.64453125,
|
|
"completions/mean_terminated_length": 248.64453125,
|
|
"completions/min_length": 91.0,
|
|
"completions/min_terminated_length": 91.0,
|
|
"epoch": 0.6496,
|
|
"grad_norm": 0.035016220062971115,
|
|
"learning_rate": 3.896797153024911e-07,
|
|
"loss": -0.0174,
|
|
"num_tokens": 199319566.0,
|
|
"reward": 1.3514586687088013,
|
|
"reward_std": 0.09576141834259033,
|
|
"rewards/accuracy_reward_long_step": 0.47265625,
|
|
"rewards/final_brier_reward_long_step": 0.7985478639602661,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7166616916656494,
|
|
"step": 406
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 486.0,
|
|
"completions/max_terminated_length": 486.0,
|
|
"completions/mean_length": 270.6796875,
|
|
"completions/mean_terminated_length": 270.6796875,
|
|
"completions/min_length": 98.0,
|
|
"completions/min_terminated_length": 98.0,
|
|
"epoch": 0.6512,
|
|
"grad_norm": 0.03493834286928177,
|
|
"learning_rate": 3.879003558718861e-07,
|
|
"loss": 0.005,
|
|
"num_tokens": 199798012.0,
|
|
"reward": 1.2670437097549438,
|
|
"reward_std": 0.1575869619846344,
|
|
"rewards/accuracy_reward_long_step": 0.375,
|
|
"rewards/final_brier_reward_long_step": 0.7614452242851257,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8067296743392944,
|
|
"step": 407
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 603.0,
|
|
"completions/max_terminated_length": 603.0,
|
|
"completions/mean_length": 257.234375,
|
|
"completions/mean_terminated_length": 257.234375,
|
|
"completions/min_length": 123.0,
|
|
"completions/min_terminated_length": 123.0,
|
|
"epoch": 0.6528,
|
|
"grad_norm": 0.17604607343673706,
|
|
"learning_rate": 3.861209964412811e-07,
|
|
"loss": -0.0137,
|
|
"num_tokens": 200299744.0,
|
|
"reward": 1.2874679565429688,
|
|
"reward_std": 0.11275781691074371,
|
|
"rewards/accuracy_reward_long_step": 0.3984375,
|
|
"rewards/final_brier_reward_long_step": 0.7716461420059204,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7844761610031128,
|
|
"step": 408
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 490.0,
|
|
"completions/max_terminated_length": 490.0,
|
|
"completions/mean_length": 245.10546875,
|
|
"completions/mean_terminated_length": 245.10546875,
|
|
"completions/min_length": 118.0,
|
|
"completions/min_terminated_length": 118.0,
|
|
"epoch": 0.6544,
|
|
"grad_norm": 0.03474588319659233,
|
|
"learning_rate": 3.8434163701067613e-07,
|
|
"loss": 0.0102,
|
|
"num_tokens": 200789587.0,
|
|
"reward": 1.3615117073059082,
|
|
"reward_std": 0.13387925922870636,
|
|
"rewards/accuracy_reward_long_step": 0.51171875,
|
|
"rewards/final_brier_reward_long_step": 0.6371433734893799,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7620280385017395,
|
|
"step": 409
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 431.0,
|
|
"completions/max_terminated_length": 431.0,
|
|
"completions/mean_length": 238.515625,
|
|
"completions/mean_terminated_length": 238.515625,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.656,
|
|
"grad_norm": 0.036233462393283844,
|
|
"learning_rate": 3.8256227758007115e-07,
|
|
"loss": 0.0044,
|
|
"num_tokens": 201275391.0,
|
|
"reward": 1.3896780014038086,
|
|
"reward_std": 0.13047534227371216,
|
|
"rewards/accuracy_reward_long_step": 0.515625,
|
|
"rewards/final_brier_reward_long_step": 0.6849026679992676,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8113091588020325,
|
|
"step": 410
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 504.0,
|
|
"completions/max_terminated_length": 504.0,
|
|
"completions/mean_length": 241.09375,
|
|
"completions/mean_terminated_length": 241.09375,
|
|
"completions/min_length": 94.0,
|
|
"completions/min_terminated_length": 94.0,
|
|
"epoch": 0.6576,
|
|
"grad_norm": 0.03837813064455986,
|
|
"learning_rate": 3.8078291814946616e-07,
|
|
"loss": -0.0032,
|
|
"num_tokens": 201756703.0,
|
|
"reward": 1.3157711029052734,
|
|
"reward_std": 0.10341217368841171,
|
|
"rewards/accuracy_reward_long_step": 0.45703125,
|
|
"rewards/final_brier_reward_long_step": 0.7142456769943237,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7207139134407043,
|
|
"step": 411
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 591.0,
|
|
"completions/max_terminated_length": 591.0,
|
|
"completions/mean_length": 238.15234375,
|
|
"completions/mean_terminated_length": 238.15234375,
|
|
"completions/min_length": 89.0,
|
|
"completions/min_terminated_length": 89.0,
|
|
"epoch": 0.6592,
|
|
"grad_norm": 0.0354132242500782,
|
|
"learning_rate": 3.790035587188612e-07,
|
|
"loss": 0.0055,
|
|
"num_tokens": 202225950.0,
|
|
"reward": 1.4732050895690918,
|
|
"reward_std": 0.1469092220067978,
|
|
"rewards/accuracy_reward_long_step": 0.58984375,
|
|
"rewards/final_brier_reward_long_step": 0.7194160223007202,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8218415975570679,
|
|
"step": 412
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 491.0,
|
|
"completions/max_terminated_length": 491.0,
|
|
"completions/mean_length": 247.93359375,
|
|
"completions/mean_terminated_length": 247.93359375,
|
|
"completions/min_length": 84.0,
|
|
"completions/min_terminated_length": 84.0,
|
|
"epoch": 0.6608,
|
|
"grad_norm": 0.046107884496450424,
|
|
"learning_rate": 3.7722419928825624e-07,
|
|
"loss": -0.0093,
|
|
"num_tokens": 202709477.0,
|
|
"reward": 1.3391033411026,
|
|
"reward_std": 0.08157768845558167,
|
|
"rewards/accuracy_reward_long_step": 0.46484375,
|
|
"rewards/final_brier_reward_long_step": 0.6804527044296265,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8165856599807739,
|
|
"step": 413
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 441.0,
|
|
"completions/max_terminated_length": 441.0,
|
|
"completions/mean_length": 225.625,
|
|
"completions/mean_terminated_length": 225.625,
|
|
"completions/min_length": 89.0,
|
|
"completions/min_terminated_length": 89.0,
|
|
"epoch": 0.6624,
|
|
"grad_norm": 0.04497173801064491,
|
|
"learning_rate": 3.7544483985765126e-07,
|
|
"loss": 0.0046,
|
|
"num_tokens": 203200421.0,
|
|
"reward": 1.4665465354919434,
|
|
"reward_std": 0.10675959289073944,
|
|
"rewards/accuracy_reward_long_step": 0.6015625,
|
|
"rewards/final_brier_reward_long_step": 0.7531781196594238,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7067579030990601,
|
|
"step": 414
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 511.0,
|
|
"completions/max_terminated_length": 511.0,
|
|
"completions/mean_length": 254.66015625,
|
|
"completions/mean_terminated_length": 254.66015625,
|
|
"completions/min_length": 91.0,
|
|
"completions/min_terminated_length": 91.0,
|
|
"epoch": 0.664,
|
|
"grad_norm": 0.037333909422159195,
|
|
"learning_rate": 3.7366548042704627e-07,
|
|
"loss": -0.005,
|
|
"num_tokens": 203701798.0,
|
|
"reward": 1.374776840209961,
|
|
"reward_std": 0.12094822525978088,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.7455586194992065,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.753548264503479,
|
|
"step": 415
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 482.0,
|
|
"completions/max_terminated_length": 482.0,
|
|
"completions/mean_length": 226.84375,
|
|
"completions/mean_terminated_length": 226.84375,
|
|
"completions/min_length": 98.0,
|
|
"completions/min_terminated_length": 98.0,
|
|
"epoch": 0.6656,
|
|
"grad_norm": 0.039919789880514145,
|
|
"learning_rate": 3.718861209964413e-07,
|
|
"loss": 0.025,
|
|
"num_tokens": 204175030.0,
|
|
"reward": 1.562011480331421,
|
|
"reward_std": 0.06771315634250641,
|
|
"rewards/accuracy_reward_long_step": 0.64453125,
|
|
"rewards/final_brier_reward_long_step": 0.8121625185012817,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8577582836151123,
|
|
"step": 416
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 409.0,
|
|
"completions/max_terminated_length": 409.0,
|
|
"completions/mean_length": 243.203125,
|
|
"completions/mean_terminated_length": 243.203125,
|
|
"completions/min_length": 112.0,
|
|
"completions/min_terminated_length": 112.0,
|
|
"epoch": 0.6672,
|
|
"grad_norm": 0.04114522784948349,
|
|
"learning_rate": 3.7010676156583625e-07,
|
|
"loss": 0.0153,
|
|
"num_tokens": 204655778.0,
|
|
"reward": 1.4633723497390747,
|
|
"reward_std": 0.0920601338148117,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.8287187218666077,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8528955578804016,
|
|
"step": 417
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 606.0,
|
|
"completions/max_terminated_length": 606.0,
|
|
"completions/mean_length": 244.66015625,
|
|
"completions/mean_terminated_length": 244.66015625,
|
|
"completions/min_length": 74.0,
|
|
"completions/min_terminated_length": 74.0,
|
|
"epoch": 0.6688,
|
|
"grad_norm": 0.0328848697245121,
|
|
"learning_rate": 3.6832740213523126e-07,
|
|
"loss": 0.005,
|
|
"num_tokens": 205149811.0,
|
|
"reward": 1.3774842023849487,
|
|
"reward_std": 0.13209398090839386,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.805209755897522,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.751602292060852,
|
|
"step": 418
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 447.0,
|
|
"completions/max_terminated_length": 447.0,
|
|
"completions/mean_length": 235.80859375,
|
|
"completions/mean_terminated_length": 235.80859375,
|
|
"completions/min_length": 95.0,
|
|
"completions/min_terminated_length": 95.0,
|
|
"epoch": 0.6704,
|
|
"grad_norm": 0.04273487254977226,
|
|
"learning_rate": 3.6654804270462633e-07,
|
|
"loss": 0.0065,
|
|
"num_tokens": 205628714.0,
|
|
"reward": 1.306333303451538,
|
|
"reward_std": 0.1178286001086235,
|
|
"rewards/accuracy_reward_long_step": 0.41796875,
|
|
"rewards/final_brier_reward_long_step": 0.7348085641860962,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8186495304107666,
|
|
"step": 419
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 581.0,
|
|
"completions/max_terminated_length": 581.0,
|
|
"completions/mean_length": 244.96484375,
|
|
"completions/mean_terminated_length": 244.96484375,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.672,
|
|
"grad_norm": 0.04026668146252632,
|
|
"learning_rate": 3.6476868327402134e-07,
|
|
"loss": 0.0007,
|
|
"num_tokens": 206110425.0,
|
|
"reward": 1.3591866493225098,
|
|
"reward_std": 0.13395720720291138,
|
|
"rewards/accuracy_reward_long_step": 0.47265625,
|
|
"rewards/final_brier_reward_long_step": 0.7895034551620483,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7566181421279907,
|
|
"step": 420
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 590.0,
|
|
"completions/max_terminated_length": 590.0,
|
|
"completions/mean_length": 244.67578125,
|
|
"completions/mean_terminated_length": 244.67578125,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.6736,
|
|
"grad_norm": 0.040121398866176605,
|
|
"learning_rate": 3.6298932384341636e-07,
|
|
"loss": -0.0165,
|
|
"num_tokens": 206586750.0,
|
|
"reward": 1.3462982177734375,
|
|
"reward_std": 0.140256866812706,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.6518319845199585,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8271111249923706,
|
|
"step": 421
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 382.0,
|
|
"completions/max_terminated_length": 382.0,
|
|
"completions/mean_length": 234.125,
|
|
"completions/mean_terminated_length": 234.125,
|
|
"completions/min_length": 86.0,
|
|
"completions/min_terminated_length": 86.0,
|
|
"epoch": 0.6752,
|
|
"grad_norm": 0.04478037729859352,
|
|
"learning_rate": 3.6120996441281137e-07,
|
|
"loss": 0.0001,
|
|
"num_tokens": 207062502.0,
|
|
"reward": 1.5341248512268066,
|
|
"reward_std": 0.12208271771669388,
|
|
"rewards/accuracy_reward_long_step": 0.63671875,
|
|
"rewards/final_brier_reward_long_step": 0.7614644765853882,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8281601071357727,
|
|
"step": 422
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 460.0,
|
|
"completions/max_terminated_length": 460.0,
|
|
"completions/mean_length": 246.85546875,
|
|
"completions/mean_terminated_length": 246.85546875,
|
|
"completions/min_length": 91.0,
|
|
"completions/min_terminated_length": 91.0,
|
|
"epoch": 0.6768,
|
|
"grad_norm": 0.041672661900520325,
|
|
"learning_rate": 3.594306049822064e-07,
|
|
"loss": 0.0102,
|
|
"num_tokens": 207558433.0,
|
|
"reward": 1.2478289604187012,
|
|
"reward_std": 0.09173645079135895,
|
|
"rewards/accuracy_reward_long_step": 0.34375,
|
|
"rewards/final_brier_reward_long_step": 0.8001093864440918,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8162060976028442,
|
|
"step": 423
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 406.0,
|
|
"completions/max_terminated_length": 406.0,
|
|
"completions/mean_length": 235.1484375,
|
|
"completions/mean_terminated_length": 235.1484375,
|
|
"completions/min_length": 92.0,
|
|
"completions/min_terminated_length": 92.0,
|
|
"epoch": 0.6784,
|
|
"grad_norm": 0.04561910033226013,
|
|
"learning_rate": 3.576512455516014e-07,
|
|
"loss": -0.0036,
|
|
"num_tokens": 208031183.0,
|
|
"reward": 1.43558931350708,
|
|
"reward_std": 0.09829960763454437,
|
|
"rewards/accuracy_reward_long_step": 0.51171875,
|
|
"rewards/final_brier_reward_long_step": 0.8206312656402588,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8748506903648376,
|
|
"step": 424
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 493.0,
|
|
"completions/max_terminated_length": 493.0,
|
|
"completions/mean_length": 252.4140625,
|
|
"completions/mean_terminated_length": 252.4140625,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.68,
|
|
"grad_norm": 0.039236631244421005,
|
|
"learning_rate": 3.5587188612099647e-07,
|
|
"loss": -0.0036,
|
|
"num_tokens": 208527737.0,
|
|
"reward": 1.433516263961792,
|
|
"reward_std": 0.17410725355148315,
|
|
"rewards/accuracy_reward_long_step": 0.53515625,
|
|
"rewards/final_brier_reward_long_step": 0.7744226455688477,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8268297910690308,
|
|
"step": 425
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 464.0,
|
|
"completions/max_terminated_length": 464.0,
|
|
"completions/mean_length": 223.62890625,
|
|
"completions/mean_terminated_length": 223.62890625,
|
|
"completions/min_length": 86.0,
|
|
"completions/min_terminated_length": 86.0,
|
|
"epoch": 0.6816,
|
|
"grad_norm": 0.034077707678079605,
|
|
"learning_rate": 3.540925266903915e-07,
|
|
"loss": -0.0108,
|
|
"num_tokens": 209000154.0,
|
|
"reward": 1.3932843208312988,
|
|
"reward_std": 0.08521192520856857,
|
|
"rewards/accuracy_reward_long_step": 0.453125,
|
|
"rewards/final_brier_reward_long_step": 0.9108027219772339,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8498345017433167,
|
|
"step": 426
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 882.0,
|
|
"completions/max_terminated_length": 882.0,
|
|
"completions/mean_length": 240.484375,
|
|
"completions/mean_terminated_length": 240.484375,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.6832,
|
|
"grad_norm": 0.04128441587090492,
|
|
"learning_rate": 3.5231316725978644e-07,
|
|
"loss": -0.0074,
|
|
"num_tokens": 209489150.0,
|
|
"reward": 1.565014123916626,
|
|
"reward_std": 0.15992087125778198,
|
|
"rewards/accuracy_reward_long_step": 0.671875,
|
|
"rewards/final_brier_reward_long_step": 0.7421960830688477,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8381730318069458,
|
|
"step": 427
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 471.0,
|
|
"completions/max_terminated_length": 471.0,
|
|
"completions/mean_length": 244.76953125,
|
|
"completions/mean_terminated_length": 244.76953125,
|
|
"completions/min_length": 97.0,
|
|
"completions/min_terminated_length": 97.0,
|
|
"epoch": 0.6848,
|
|
"grad_norm": 0.043866030871868134,
|
|
"learning_rate": 3.5053380782918146e-07,
|
|
"loss": -0.0145,
|
|
"num_tokens": 209985339.0,
|
|
"reward": 1.3970236778259277,
|
|
"reward_std": 0.16603073477745056,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.7696589827537537,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8262485861778259,
|
|
"step": 428
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 520.0,
|
|
"completions/max_terminated_length": 520.0,
|
|
"completions/mean_length": 234.33984375,
|
|
"completions/mean_terminated_length": 234.33984375,
|
|
"completions/min_length": 91.0,
|
|
"completions/min_terminated_length": 91.0,
|
|
"epoch": 0.6864,
|
|
"grad_norm": 0.042403049767017365,
|
|
"learning_rate": 3.4875444839857647e-07,
|
|
"loss": 0.0113,
|
|
"num_tokens": 210472786.0,
|
|
"reward": 1.5308232307434082,
|
|
"reward_std": 0.12255808711051941,
|
|
"rewards/accuracy_reward_long_step": 0.63671875,
|
|
"rewards/final_brier_reward_long_step": 0.7184535264968872,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8579643964767456,
|
|
"step": 429
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 550.0,
|
|
"completions/max_terminated_length": 550.0,
|
|
"completions/mean_length": 230.375,
|
|
"completions/mean_terminated_length": 230.375,
|
|
"completions/min_length": 76.0,
|
|
"completions/min_terminated_length": 76.0,
|
|
"epoch": 0.688,
|
|
"grad_norm": 0.040890295058488846,
|
|
"learning_rate": 3.469750889679715e-07,
|
|
"loss": 0.0102,
|
|
"num_tokens": 210951450.0,
|
|
"reward": 1.3568546772003174,
|
|
"reward_std": 0.20398034155368805,
|
|
"rewards/accuracy_reward_long_step": 0.47265625,
|
|
"rewards/final_brier_reward_long_step": 0.7493456602096558,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7952607274055481,
|
|
"step": 430
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 497.0,
|
|
"completions/max_terminated_length": 497.0,
|
|
"completions/mean_length": 240.0,
|
|
"completions/mean_terminated_length": 240.0,
|
|
"completions/min_length": 87.0,
|
|
"completions/min_terminated_length": 87.0,
|
|
"epoch": 0.6896,
|
|
"grad_norm": 0.03938218578696251,
|
|
"learning_rate": 3.4519572953736656e-07,
|
|
"loss": 0.0071,
|
|
"num_tokens": 211431050.0,
|
|
"reward": 1.369812250137329,
|
|
"reward_std": 0.16209545731544495,
|
|
"rewards/accuracy_reward_long_step": 0.4921875,
|
|
"rewards/final_brier_reward_long_step": 0.7736667990684509,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7368321418762207,
|
|
"step": 431
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 464.0,
|
|
"completions/max_terminated_length": 464.0,
|
|
"completions/mean_length": 238.2109375,
|
|
"completions/mean_terminated_length": 238.2109375,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.6912,
|
|
"grad_norm": 0.04615631699562073,
|
|
"learning_rate": 3.4341637010676157e-07,
|
|
"loss": 0.0087,
|
|
"num_tokens": 211919552.0,
|
|
"reward": 1.415741205215454,
|
|
"reward_std": 0.0865730568766594,
|
|
"rewards/accuracy_reward_long_step": 0.5078125,
|
|
"rewards/final_brier_reward_long_step": 0.7965086102485657,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8352065682411194,
|
|
"step": 432
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 500.0,
|
|
"completions/max_terminated_length": 500.0,
|
|
"completions/mean_length": 230.30078125,
|
|
"completions/mean_terminated_length": 230.30078125,
|
|
"completions/min_length": 68.0,
|
|
"completions/min_terminated_length": 68.0,
|
|
"epoch": 0.6928,
|
|
"grad_norm": 0.045928288251161575,
|
|
"learning_rate": 3.416370106761566e-07,
|
|
"loss": 0.0084,
|
|
"num_tokens": 212409581.0,
|
|
"reward": 1.4080349206924438,
|
|
"reward_std": 0.10854038596153259,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.8253128528594971,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8224518895149231,
|
|
"step": 433
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 513.0,
|
|
"completions/max_terminated_length": 513.0,
|
|
"completions/mean_length": 243.75390625,
|
|
"completions/mean_terminated_length": 243.75390625,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.6944,
|
|
"grad_norm": 0.036884017288684845,
|
|
"learning_rate": 3.398576512455516e-07,
|
|
"loss": -0.01,
|
|
"num_tokens": 212874262.0,
|
|
"reward": 1.3945372104644775,
|
|
"reward_std": 0.1757126748561859,
|
|
"rewards/accuracy_reward_long_step": 0.50390625,
|
|
"rewards/final_brier_reward_long_step": 0.8027616143226624,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7597622275352478,
|
|
"step": 434
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 457.0,
|
|
"completions/max_terminated_length": 457.0,
|
|
"completions/mean_length": 226.0,
|
|
"completions/mean_terminated_length": 226.0,
|
|
"completions/min_length": 66.0,
|
|
"completions/min_terminated_length": 66.0,
|
|
"epoch": 0.696,
|
|
"grad_norm": 0.03966812789440155,
|
|
"learning_rate": 3.380782918149466e-07,
|
|
"loss": 0.0137,
|
|
"num_tokens": 213355310.0,
|
|
"reward": 1.436761736869812,
|
|
"reward_std": 0.10320307314395905,
|
|
"rewards/accuracy_reward_long_step": 0.55078125,
|
|
"rewards/final_brier_reward_long_step": 0.7525194883346558,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7914024591445923,
|
|
"step": 435
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 600.0,
|
|
"completions/max_terminated_length": 600.0,
|
|
"completions/mean_length": 240.4921875,
|
|
"completions/mean_terminated_length": 240.4921875,
|
|
"completions/min_length": 91.0,
|
|
"completions/min_terminated_length": 91.0,
|
|
"epoch": 0.6976,
|
|
"grad_norm": 0.04616158828139305,
|
|
"learning_rate": 3.3629893238434163e-07,
|
|
"loss": -0.0162,
|
|
"num_tokens": 213835492.0,
|
|
"reward": 1.345383882522583,
|
|
"reward_std": 0.14034321904182434,
|
|
"rewards/accuracy_reward_long_step": 0.4453125,
|
|
"rewards/final_brier_reward_long_step": 0.783481240272522,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8168047666549683,
|
|
"step": 436
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 565.0,
|
|
"completions/max_terminated_length": 565.0,
|
|
"completions/mean_length": 225.7578125,
|
|
"completions/mean_terminated_length": 225.7578125,
|
|
"completions/min_length": 100.0,
|
|
"completions/min_terminated_length": 100.0,
|
|
"epoch": 0.6992,
|
|
"grad_norm": 0.04164176806807518,
|
|
"learning_rate": 3.3451957295373664e-07,
|
|
"loss": 0.0035,
|
|
"num_tokens": 214322966.0,
|
|
"reward": 1.3548498153686523,
|
|
"reward_std": 0.14276200532913208,
|
|
"rewards/accuracy_reward_long_step": 0.453125,
|
|
"rewards/final_brier_reward_long_step": 0.8077800273895264,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7991191744804382,
|
|
"step": 437
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 502.0,
|
|
"completions/max_terminated_length": 502.0,
|
|
"completions/mean_length": 226.65625,
|
|
"completions/mean_terminated_length": 226.65625,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.7008,
|
|
"grad_norm": 0.03757995367050171,
|
|
"learning_rate": 3.3274021352313166e-07,
|
|
"loss": 0.0176,
|
|
"num_tokens": 214794966.0,
|
|
"reward": 1.5220391750335693,
|
|
"reward_std": 0.1383122354745865,
|
|
"rewards/accuracy_reward_long_step": 0.62109375,
|
|
"rewards/final_brier_reward_long_step": 0.7596441507339478,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8441376686096191,
|
|
"step": 438
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 592.0,
|
|
"completions/max_terminated_length": 592.0,
|
|
"completions/mean_length": 237.671875,
|
|
"completions/mean_terminated_length": 237.671875,
|
|
"completions/min_length": 74.0,
|
|
"completions/min_terminated_length": 74.0,
|
|
"epoch": 0.7024,
|
|
"grad_norm": 0.04206470027565956,
|
|
"learning_rate": 3.3096085409252667e-07,
|
|
"loss": 0.0228,
|
|
"num_tokens": 215274386.0,
|
|
"reward": 1.4969501495361328,
|
|
"reward_std": 0.1166161373257637,
|
|
"rewards/accuracy_reward_long_step": 0.5859375,
|
|
"rewards/final_brier_reward_long_step": 0.8239851593971252,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.820065438747406,
|
|
"step": 439
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 520.0,
|
|
"completions/max_terminated_length": 520.0,
|
|
"completions/mean_length": 241.5390625,
|
|
"completions/mean_terminated_length": 241.5390625,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.704,
|
|
"grad_norm": 0.03900426998734474,
|
|
"learning_rate": 3.291814946619217e-07,
|
|
"loss": -0.0161,
|
|
"num_tokens": 215761204.0,
|
|
"reward": 1.4888627529144287,
|
|
"reward_std": 0.07008583098649979,
|
|
"rewards/accuracy_reward_long_step": 0.61328125,
|
|
"rewards/final_brier_reward_long_step": 0.7496683597564697,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7526575326919556,
|
|
"step": 440
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 416.0,
|
|
"completions/max_terminated_length": 416.0,
|
|
"completions/mean_length": 226.73828125,
|
|
"completions/mean_terminated_length": 226.73828125,
|
|
"completions/min_length": 95.0,
|
|
"completions/min_terminated_length": 95.0,
|
|
"epoch": 0.7056,
|
|
"grad_norm": 0.04039409011602402,
|
|
"learning_rate": 3.274021352313167e-07,
|
|
"loss": 0.0156,
|
|
"num_tokens": 216251281.0,
|
|
"reward": 1.4942941665649414,
|
|
"reward_std": 0.14357957243919373,
|
|
"rewards/accuracy_reward_long_step": 0.60546875,
|
|
"rewards/final_brier_reward_long_step": 0.7177531123161316,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8375486135482788,
|
|
"step": 441
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 486.0,
|
|
"completions/max_terminated_length": 486.0,
|
|
"completions/mean_length": 224.7421875,
|
|
"completions/mean_terminated_length": 225.62353515625,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 65.0,
|
|
"epoch": 0.7072,
|
|
"grad_norm": 0.03879899904131889,
|
|
"learning_rate": 3.256227758007117e-07,
|
|
"loss": -0.0177,
|
|
"num_tokens": 216745127.0,
|
|
"reward": 1.3038554191589355,
|
|
"reward_std": 0.1679355949163437,
|
|
"rewards/accuracy_reward_long_step": 0.421875,
|
|
"rewards/final_brier_reward_long_step": 0.7172562479972839,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8184775114059448,
|
|
"step": 442
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 528.0,
|
|
"completions/max_terminated_length": 528.0,
|
|
"completions/mean_length": 225.91796875,
|
|
"completions/mean_terminated_length": 225.91796875,
|
|
"completions/min_length": 92.0,
|
|
"completions/min_terminated_length": 92.0,
|
|
"epoch": 0.7088,
|
|
"grad_norm": 0.04107962176203728,
|
|
"learning_rate": 3.238434163701068e-07,
|
|
"loss": 0.0022,
|
|
"num_tokens": 217241754.0,
|
|
"reward": 1.3336093425750732,
|
|
"reward_std": 0.11196212470531464,
|
|
"rewards/accuracy_reward_long_step": 0.42578125,
|
|
"rewards/final_brier_reward_long_step": 0.7749031186103821,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.85640949010849,
|
|
"step": 443
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 585.0,
|
|
"completions/max_terminated_length": 585.0,
|
|
"completions/mean_length": 226.2890625,
|
|
"completions/mean_terminated_length": 226.2890625,
|
|
"completions/min_length": 95.0,
|
|
"completions/min_terminated_length": 95.0,
|
|
"epoch": 0.7104,
|
|
"grad_norm": 0.04981570690870285,
|
|
"learning_rate": 3.220640569395018e-07,
|
|
"loss": -0.0113,
|
|
"num_tokens": 217723420.0,
|
|
"reward": 1.4336767196655273,
|
|
"reward_std": 0.12888742983341217,
|
|
"rewards/accuracy_reward_long_step": 0.56640625,
|
|
"rewards/final_brier_reward_long_step": 0.7152671813964844,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7538148760795593,
|
|
"step": 444
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 520.0,
|
|
"completions/max_terminated_length": 520.0,
|
|
"completions/mean_length": 220.1328125,
|
|
"completions/mean_terminated_length": 220.1328125,
|
|
"completions/min_length": 79.0,
|
|
"completions/min_terminated_length": 79.0,
|
|
"epoch": 0.712,
|
|
"grad_norm": 0.043584324419498444,
|
|
"learning_rate": 3.202846975088968e-07,
|
|
"loss": 0.0119,
|
|
"num_tokens": 218212494.0,
|
|
"reward": 1.4643619060516357,
|
|
"reward_std": 0.12725988030433655,
|
|
"rewards/accuracy_reward_long_step": 0.56640625,
|
|
"rewards/final_brier_reward_long_step": 0.7841925621032715,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8076297640800476,
|
|
"step": 445
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 395.0,
|
|
"completions/max_terminated_length": 395.0,
|
|
"completions/mean_length": 230.3515625,
|
|
"completions/mean_terminated_length": 230.3515625,
|
|
"completions/min_length": 66.0,
|
|
"completions/min_terminated_length": 66.0,
|
|
"epoch": 0.7136,
|
|
"grad_norm": 0.03610699251294136,
|
|
"learning_rate": 3.1850533807829177e-07,
|
|
"loss": -0.0169,
|
|
"num_tokens": 218694184.0,
|
|
"reward": 1.559215784072876,
|
|
"reward_std": 0.07399096339941025,
|
|
"rewards/accuracy_reward_long_step": 0.66015625,
|
|
"rewards/final_brier_reward_long_step": 0.790971040725708,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8052672147750854,
|
|
"step": 446
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 466.0,
|
|
"completions/max_terminated_length": 466.0,
|
|
"completions/mean_length": 217.97265625,
|
|
"completions/mean_terminated_length": 217.97265625,
|
|
"completions/min_length": 81.0,
|
|
"completions/min_terminated_length": 81.0,
|
|
"epoch": 0.7152,
|
|
"grad_norm": 0.0351036936044693,
|
|
"learning_rate": 3.167259786476868e-07,
|
|
"loss": -0.0161,
|
|
"num_tokens": 219155313.0,
|
|
"reward": 1.5995757579803467,
|
|
"reward_std": 0.08523780107498169,
|
|
"rewards/accuracy_reward_long_step": 0.7109375,
|
|
"rewards/final_brier_reward_long_step": 0.7231503129005432,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8314027786254883,
|
|
"step": 447
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 589.0,
|
|
"completions/max_terminated_length": 589.0,
|
|
"completions/mean_length": 248.65234375,
|
|
"completions/mean_terminated_length": 248.65234375,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.7168,
|
|
"grad_norm": 0.04068749397993088,
|
|
"learning_rate": 3.149466192170818e-07,
|
|
"loss": 0.0197,
|
|
"num_tokens": 219654384.0,
|
|
"reward": 1.4963853359222412,
|
|
"reward_std": 0.1300518661737442,
|
|
"rewards/accuracy_reward_long_step": 0.578125,
|
|
"rewards/final_brier_reward_long_step": 0.794231653213501,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8788096904754639,
|
|
"step": 448
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 595.0,
|
|
"completions/max_terminated_length": 595.0,
|
|
"completions/mean_length": 225.171875,
|
|
"completions/mean_terminated_length": 225.171875,
|
|
"completions/min_length": 72.0,
|
|
"completions/min_terminated_length": 72.0,
|
|
"epoch": 0.7184,
|
|
"grad_norm": 0.037970248609781265,
|
|
"learning_rate": 3.1316725978647687e-07,
|
|
"loss": 0.0184,
|
|
"num_tokens": 220128044.0,
|
|
"reward": 1.2982159852981567,
|
|
"reward_std": 0.18391726911067963,
|
|
"rewards/accuracy_reward_long_step": 0.421875,
|
|
"rewards/final_brier_reward_long_step": 0.7201941013336182,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7851696014404297,
|
|
"step": 449
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 462.0,
|
|
"completions/max_terminated_length": 462.0,
|
|
"completions/mean_length": 233.64453125,
|
|
"completions/mean_terminated_length": 233.64453125,
|
|
"completions/min_length": 79.0,
|
|
"completions/min_terminated_length": 79.0,
|
|
"epoch": 0.72,
|
|
"grad_norm": 0.040078479796648026,
|
|
"learning_rate": 3.113879003558719e-07,
|
|
"loss": -0.0011,
|
|
"num_tokens": 220588769.0,
|
|
"reward": 1.5124320983886719,
|
|
"reward_std": 0.10843676328659058,
|
|
"rewards/accuracy_reward_long_step": 0.6015625,
|
|
"rewards/final_brier_reward_long_step": 0.7919909954071045,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8593000173568726,
|
|
"step": 450
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 494.0,
|
|
"completions/max_terminated_length": 494.0,
|
|
"completions/mean_length": 217.15234375,
|
|
"completions/mean_terminated_length": 217.15234375,
|
|
"completions/min_length": 74.0,
|
|
"completions/min_terminated_length": 74.0,
|
|
"epoch": 0.7216,
|
|
"grad_norm": 0.03554774448275566,
|
|
"learning_rate": 3.096085409252669e-07,
|
|
"loss": 0.007,
|
|
"num_tokens": 221075272.0,
|
|
"reward": 1.4489461183547974,
|
|
"reward_std": 0.10056018829345703,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.811571478843689,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.85921311378479,
|
|
"step": 451
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 508.0,
|
|
"completions/max_terminated_length": 508.0,
|
|
"completions/mean_length": 220.69140625,
|
|
"completions/mean_terminated_length": 220.69140625,
|
|
"completions/min_length": 88.0,
|
|
"completions/min_terminated_length": 88.0,
|
|
"epoch": 0.7232,
|
|
"grad_norm": 0.03730124980211258,
|
|
"learning_rate": 3.078291814946619e-07,
|
|
"loss": -0.0023,
|
|
"num_tokens": 221558689.0,
|
|
"reward": 1.3989410400390625,
|
|
"reward_std": 0.10487143695354462,
|
|
"rewards/accuracy_reward_long_step": 0.55078125,
|
|
"rewards/final_brier_reward_long_step": 0.6452429294586182,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7473963499069214,
|
|
"step": 452
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 518.0,
|
|
"completions/max_terminated_length": 518.0,
|
|
"completions/mean_length": 217.04296875,
|
|
"completions/mean_terminated_length": 217.04296875,
|
|
"completions/min_length": 88.0,
|
|
"completions/min_terminated_length": 88.0,
|
|
"epoch": 0.7248,
|
|
"grad_norm": 0.038142986595630646,
|
|
"learning_rate": 3.0604982206405693e-07,
|
|
"loss": -0.025,
|
|
"num_tokens": 222026756.0,
|
|
"reward": 1.5077660083770752,
|
|
"reward_std": 0.146861732006073,
|
|
"rewards/accuracy_reward_long_step": 0.625,
|
|
"rewards/final_brier_reward_long_step": 0.6923167705535889,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8387469053268433,
|
|
"step": 453
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 453.0,
|
|
"completions/max_terminated_length": 453.0,
|
|
"completions/mean_length": 219.828125,
|
|
"completions/mean_terminated_length": 219.828125,
|
|
"completions/min_length": 74.0,
|
|
"completions/min_terminated_length": 74.0,
|
|
"epoch": 0.7264,
|
|
"grad_norm": 0.039899833500385284,
|
|
"learning_rate": 3.0427046263345194e-07,
|
|
"loss": -0.0029,
|
|
"num_tokens": 222498648.0,
|
|
"reward": 1.468416452407837,
|
|
"reward_std": 0.08466193825006485,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.8156249523162842,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8861656188964844,
|
|
"step": 454
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 492.0,
|
|
"completions/max_terminated_length": 492.0,
|
|
"completions/mean_length": 226.6796875,
|
|
"completions/mean_terminated_length": 226.6796875,
|
|
"completions/min_length": 82.0,
|
|
"completions/min_terminated_length": 82.0,
|
|
"epoch": 0.728,
|
|
"grad_norm": 0.04437141865491867,
|
|
"learning_rate": 3.02491103202847e-07,
|
|
"loss": 0.009,
|
|
"num_tokens": 222986366.0,
|
|
"reward": 1.3412034511566162,
|
|
"reward_std": 0.17921078205108643,
|
|
"rewards/accuracy_reward_long_step": 0.44921875,
|
|
"rewards/final_brier_reward_long_step": 0.7214792966842651,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8464598655700684,
|
|
"step": 455
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 463.0,
|
|
"completions/max_terminated_length": 463.0,
|
|
"completions/mean_length": 231.34765625,
|
|
"completions/mean_terminated_length": 231.34765625,
|
|
"completions/min_length": 114.0,
|
|
"completions/min_terminated_length": 114.0,
|
|
"epoch": 0.7296,
|
|
"grad_norm": 0.03869875520467758,
|
|
"learning_rate": 3.0071174377224197e-07,
|
|
"loss": 0.0084,
|
|
"num_tokens": 223455831.0,
|
|
"reward": 1.5209224224090576,
|
|
"reward_std": 0.1157989650964737,
|
|
"rewards/accuracy_reward_long_step": 0.6328125,
|
|
"rewards/final_brier_reward_long_step": 0.7708175778388977,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7816216945648193,
|
|
"step": 456
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 430.0,
|
|
"completions/max_terminated_length": 430.0,
|
|
"completions/mean_length": 227.9765625,
|
|
"completions/mean_terminated_length": 227.9765625,
|
|
"completions/min_length": 98.0,
|
|
"completions/min_terminated_length": 98.0,
|
|
"epoch": 0.7312,
|
|
"grad_norm": 0.04939800873398781,
|
|
"learning_rate": 2.98932384341637e-07,
|
|
"loss": -0.0007,
|
|
"num_tokens": 223944633.0,
|
|
"reward": 1.2755203247070312,
|
|
"reward_std": 0.16737382113933563,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.694128155708313,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7517032623291016,
|
|
"step": 457
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 434.0,
|
|
"completions/max_terminated_length": 434.0,
|
|
"completions/mean_length": 209.98046875,
|
|
"completions/mean_terminated_length": 209.98046875,
|
|
"completions/min_length": 85.0,
|
|
"completions/min_terminated_length": 85.0,
|
|
"epoch": 0.7328,
|
|
"grad_norm": 0.0375138595700264,
|
|
"learning_rate": 2.97153024911032e-07,
|
|
"loss": 0.0098,
|
|
"num_tokens": 224409132.0,
|
|
"reward": 1.4265563488006592,
|
|
"reward_std": 0.17411382496356964,
|
|
"rewards/accuracy_reward_long_step": 0.53515625,
|
|
"rewards/final_brier_reward_long_step": 0.7519593834877014,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8214536309242249,
|
|
"step": 458
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 536.0,
|
|
"completions/max_terminated_length": 536.0,
|
|
"completions/mean_length": 220.8828125,
|
|
"completions/mean_terminated_length": 220.8828125,
|
|
"completions/min_length": 66.0,
|
|
"completions/min_terminated_length": 66.0,
|
|
"epoch": 0.7344,
|
|
"grad_norm": 0.03728632628917694,
|
|
"learning_rate": 2.95373665480427e-07,
|
|
"loss": -0.0191,
|
|
"num_tokens": 224884646.0,
|
|
"reward": 1.4635272026062012,
|
|
"reward_std": 0.08029723912477493,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.7463421821594238,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8577666282653809,
|
|
"step": 459
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 674.0,
|
|
"completions/max_terminated_length": 674.0,
|
|
"completions/mean_length": 234.73828125,
|
|
"completions/mean_terminated_length": 234.73828125,
|
|
"completions/min_length": 93.0,
|
|
"completions/min_terminated_length": 93.0,
|
|
"epoch": 0.736,
|
|
"grad_norm": 0.032615575939416885,
|
|
"learning_rate": 2.9359430604982203e-07,
|
|
"loss": 0.0027,
|
|
"num_tokens": 225385123.0,
|
|
"reward": 1.5013632774353027,
|
|
"reward_std": 0.09813569486141205,
|
|
"rewards/accuracy_reward_long_step": 0.58984375,
|
|
"rewards/final_brier_reward_long_step": 0.8148428201675415,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8312351703643799,
|
|
"step": 460
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 461.0,
|
|
"completions/max_terminated_length": 461.0,
|
|
"completions/mean_length": 231.265625,
|
|
"completions/mean_terminated_length": 231.265625,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.7376,
|
|
"grad_norm": 0.038810133934020996,
|
|
"learning_rate": 2.918149466192171e-07,
|
|
"loss": 0.0063,
|
|
"num_tokens": 225872791.0,
|
|
"reward": 1.4602744579315186,
|
|
"reward_std": 0.15246982872486115,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.8496265411376953,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8352211713790894,
|
|
"step": 461
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 461.0,
|
|
"completions/max_terminated_length": 461.0,
|
|
"completions/mean_length": 226.99609375,
|
|
"completions/mean_terminated_length": 226.99609375,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.7392,
|
|
"grad_norm": 0.037601787596940994,
|
|
"learning_rate": 2.900355871886121e-07,
|
|
"loss": -0.0017,
|
|
"num_tokens": 226377910.0,
|
|
"reward": 1.4851224422454834,
|
|
"reward_std": 0.12114303559064865,
|
|
"rewards/accuracy_reward_long_step": 0.6015625,
|
|
"rewards/final_brier_reward_long_step": 0.691071093082428,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8431686162948608,
|
|
"step": 462
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 422.0,
|
|
"completions/max_terminated_length": 422.0,
|
|
"completions/mean_length": 222.1015625,
|
|
"completions/mean_terminated_length": 222.1015625,
|
|
"completions/min_length": 99.0,
|
|
"completions/min_terminated_length": 99.0,
|
|
"epoch": 0.7408,
|
|
"grad_norm": 0.03438973426818848,
|
|
"learning_rate": 2.882562277580071e-07,
|
|
"loss": 0.0082,
|
|
"num_tokens": 226869616.0,
|
|
"reward": 1.5410441160202026,
|
|
"reward_std": 0.10235248506069183,
|
|
"rewards/accuracy_reward_long_step": 0.65625,
|
|
"rewards/final_brier_reward_long_step": 0.7193183302879333,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8198581337928772,
|
|
"step": 463
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 383.0,
|
|
"completions/max_terminated_length": 383.0,
|
|
"completions/mean_length": 220.09375,
|
|
"completions/mean_terminated_length": 220.09375,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.7424,
|
|
"grad_norm": 0.055105436593294144,
|
|
"learning_rate": 2.8647686832740214e-07,
|
|
"loss": -0.0036,
|
|
"num_tokens": 227356184.0,
|
|
"reward": 1.388896107673645,
|
|
"reward_std": 0.10595919191837311,
|
|
"rewards/accuracy_reward_long_step": 0.5546875,
|
|
"rewards/final_brier_reward_long_step": 0.5826694965362549,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7541650533676147,
|
|
"step": 464
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 575.0,
|
|
"completions/max_terminated_length": 575.0,
|
|
"completions/mean_length": 223.9765625,
|
|
"completions/mean_terminated_length": 223.9765625,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.744,
|
|
"grad_norm": 0.033698540180921555,
|
|
"learning_rate": 2.8469750889679715e-07,
|
|
"loss": 0.0016,
|
|
"num_tokens": 227845482.0,
|
|
"reward": 1.5462651252746582,
|
|
"reward_std": 0.10527972877025604,
|
|
"rewards/accuracy_reward_long_step": 0.62890625,
|
|
"rewards/final_brier_reward_long_step": 0.8236533403396606,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8457825183868408,
|
|
"step": 465
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 639.0,
|
|
"completions/max_terminated_length": 639.0,
|
|
"completions/mean_length": 229.96484375,
|
|
"completions/mean_terminated_length": 229.96484375,
|
|
"completions/min_length": 91.0,
|
|
"completions/min_terminated_length": 91.0,
|
|
"epoch": 0.7456,
|
|
"grad_norm": 0.05342903360724449,
|
|
"learning_rate": 2.829181494661921e-07,
|
|
"loss": 0.0089,
|
|
"num_tokens": 228346801.0,
|
|
"reward": 1.3685379028320312,
|
|
"reward_std": 0.1639987826347351,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.7799558639526367,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7723206281661987,
|
|
"step": 466
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 508.0,
|
|
"completions/max_terminated_length": 508.0,
|
|
"completions/mean_length": 230.171875,
|
|
"completions/mean_terminated_length": 230.171875,
|
|
"completions/min_length": 123.0,
|
|
"completions/min_terminated_length": 123.0,
|
|
"epoch": 0.7472,
|
|
"grad_norm": 0.037923168390989304,
|
|
"learning_rate": 2.811387900355872e-07,
|
|
"loss": -0.0086,
|
|
"num_tokens": 228834093.0,
|
|
"reward": 1.4359982013702393,
|
|
"reward_std": 0.12845033407211304,
|
|
"rewards/accuracy_reward_long_step": 0.53125,
|
|
"rewards/final_brier_reward_long_step": 0.7942116856575012,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.824781060218811,
|
|
"step": 467
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 505.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 226.984375,
|
|
"completions/mean_terminated_length": 226.984375,
|
|
"completions/min_length": 139.0,
|
|
"completions/min_terminated_length": 139.0,
|
|
"epoch": 0.7488,
|
|
"grad_norm": 0.040297914296388626,
|
|
"learning_rate": 2.793594306049822e-07,
|
|
"loss": -0.012,
|
|
"num_tokens": 229311481.0,
|
|
"reward": 1.357725977897644,
|
|
"reward_std": 0.11814339458942413,
|
|
"rewards/accuracy_reward_long_step": 0.46484375,
|
|
"rewards/final_brier_reward_long_step": 0.7514722347259521,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.820056676864624,
|
|
"step": 468
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 468.0,
|
|
"completions/max_terminated_length": 468.0,
|
|
"completions/mean_length": 220.39453125,
|
|
"completions/mean_terminated_length": 220.39453125,
|
|
"completions/min_length": 96.0,
|
|
"completions/min_terminated_length": 96.0,
|
|
"epoch": 0.7504,
|
|
"grad_norm": 0.048747822642326355,
|
|
"learning_rate": 2.775800711743772e-07,
|
|
"loss": 0.0175,
|
|
"num_tokens": 229784622.0,
|
|
"reward": 1.3947033882141113,
|
|
"reward_std": 0.13809074461460114,
|
|
"rewards/accuracy_reward_long_step": 0.50390625,
|
|
"rewards/final_brier_reward_long_step": 0.7879499793052673,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.775239109992981,
|
|
"step": 469
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 457.0,
|
|
"completions/max_terminated_length": 457.0,
|
|
"completions/mean_length": 220.22265625,
|
|
"completions/mean_terminated_length": 220.22265625,
|
|
"completions/min_length": 96.0,
|
|
"completions/min_terminated_length": 96.0,
|
|
"epoch": 0.752,
|
|
"grad_norm": 0.05068999528884888,
|
|
"learning_rate": 2.758007117437722e-07,
|
|
"loss": -0.0182,
|
|
"num_tokens": 230259999.0,
|
|
"reward": 1.2983078956604004,
|
|
"reward_std": 0.10855422168970108,
|
|
"rewards/accuracy_reward_long_step": 0.42578125,
|
|
"rewards/final_brier_reward_long_step": 0.7253687381744385,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7647379040718079,
|
|
"step": 470
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 498.0,
|
|
"completions/max_terminated_length": 498.0,
|
|
"completions/mean_length": 228.24609375,
|
|
"completions/mean_terminated_length": 228.24609375,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.7536,
|
|
"grad_norm": 0.036536820232868195,
|
|
"learning_rate": 2.7402135231316724e-07,
|
|
"loss": -0.0054,
|
|
"num_tokens": 230736478.0,
|
|
"reward": 1.6196913719177246,
|
|
"reward_std": 0.10245537757873535,
|
|
"rewards/accuracy_reward_long_step": 0.7109375,
|
|
"rewards/final_brier_reward_long_step": 0.8031273484230042,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8318880796432495,
|
|
"step": 471
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 459.0,
|
|
"completions/max_terminated_length": 459.0,
|
|
"completions/mean_length": 230.26171875,
|
|
"completions/mean_terminated_length": 230.26171875,
|
|
"completions/min_length": 114.0,
|
|
"completions/min_terminated_length": 114.0,
|
|
"epoch": 0.7552,
|
|
"grad_norm": 0.040038324892520905,
|
|
"learning_rate": 2.7224199288256225e-07,
|
|
"loss": 0.0134,
|
|
"num_tokens": 231215745.0,
|
|
"reward": 1.253148078918457,
|
|
"reward_std": 0.10847177356481552,
|
|
"rewards/accuracy_reward_long_step": 0.3359375,
|
|
"rewards/final_brier_reward_long_step": 0.8157835602760315,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8530591130256653,
|
|
"step": 472
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 459.0,
|
|
"completions/max_terminated_length": 459.0,
|
|
"completions/mean_length": 228.953125,
|
|
"completions/mean_terminated_length": 228.953125,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.7568,
|
|
"grad_norm": 0.03876092657446861,
|
|
"learning_rate": 2.704626334519573e-07,
|
|
"loss": -0.002,
|
|
"num_tokens": 231688509.0,
|
|
"reward": 1.2908313274383545,
|
|
"reward_std": 0.11586640775203705,
|
|
"rewards/accuracy_reward_long_step": 0.38671875,
|
|
"rewards/final_brier_reward_long_step": 0.7823525667190552,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8340977430343628,
|
|
"step": 473
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 468.0,
|
|
"completions/max_terminated_length": 468.0,
|
|
"completions/mean_length": 224.69921875,
|
|
"completions/mean_terminated_length": 224.69921875,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.7584,
|
|
"grad_norm": 0.03896433115005493,
|
|
"learning_rate": 2.6868327402135234e-07,
|
|
"loss": 0.0022,
|
|
"num_tokens": 232176304.0,
|
|
"reward": 1.4751133918762207,
|
|
"reward_std": 0.09986962378025055,
|
|
"rewards/accuracy_reward_long_step": 0.56640625,
|
|
"rewards/final_brier_reward_long_step": 0.8110538721084595,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8237748742103577,
|
|
"step": 474
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 586.0,
|
|
"completions/max_terminated_length": 586.0,
|
|
"completions/mean_length": 241.28125,
|
|
"completions/mean_terminated_length": 241.28125,
|
|
"completions/min_length": 125.0,
|
|
"completions/min_terminated_length": 125.0,
|
|
"epoch": 0.76,
|
|
"grad_norm": 0.04798499867320061,
|
|
"learning_rate": 2.669039145907473e-07,
|
|
"loss": 0.0133,
|
|
"num_tokens": 232659224.0,
|
|
"reward": 1.4235737323760986,
|
|
"reward_std": 0.14305740594863892,
|
|
"rewards/accuracy_reward_long_step": 0.55859375,
|
|
"rewards/final_brier_reward_long_step": 0.6940581798553467,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7736741304397583,
|
|
"step": 475
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 580.0,
|
|
"completions/max_terminated_length": 580.0,
|
|
"completions/mean_length": 248.6875,
|
|
"completions/mean_terminated_length": 248.6875,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.7616,
|
|
"grad_norm": 0.0414130873978138,
|
|
"learning_rate": 2.651245551601423e-07,
|
|
"loss": 0.0079,
|
|
"num_tokens": 233153904.0,
|
|
"reward": 1.4499199390411377,
|
|
"reward_std": 0.13342790305614471,
|
|
"rewards/accuracy_reward_long_step": 0.53515625,
|
|
"rewards/final_brier_reward_long_step": 0.8312417268753052,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8278130292892456,
|
|
"step": 476
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 578.0,
|
|
"completions/max_terminated_length": 578.0,
|
|
"completions/mean_length": 244.85546875,
|
|
"completions/mean_terminated_length": 244.85546875,
|
|
"completions/min_length": 123.0,
|
|
"completions/min_terminated_length": 123.0,
|
|
"epoch": 0.7632,
|
|
"grad_norm": 0.03957832232117653,
|
|
"learning_rate": 2.6334519572953733e-07,
|
|
"loss": -0.0045,
|
|
"num_tokens": 233630819.0,
|
|
"reward": 1.3480762243270874,
|
|
"reward_std": 0.1540793478488922,
|
|
"rewards/accuracy_reward_long_step": 0.47265625,
|
|
"rewards/final_brier_reward_long_step": 0.7323105335235596,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7693694233894348,
|
|
"step": 477
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 429.0,
|
|
"completions/max_terminated_length": 429.0,
|
|
"completions/mean_length": 217.80078125,
|
|
"completions/mean_terminated_length": 217.80078125,
|
|
"completions/min_length": 78.0,
|
|
"completions/min_terminated_length": 78.0,
|
|
"epoch": 0.7648,
|
|
"grad_norm": 0.040704309940338135,
|
|
"learning_rate": 2.6156583629893234e-07,
|
|
"loss": 0.002,
|
|
"num_tokens": 234112184.0,
|
|
"reward": 1.4834253787994385,
|
|
"reward_std": 0.176324263215065,
|
|
"rewards/accuracy_reward_long_step": 0.58984375,
|
|
"rewards/final_brier_reward_long_step": 0.7984238266944885,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7759029865264893,
|
|
"step": 478
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 468.0,
|
|
"completions/max_terminated_length": 468.0,
|
|
"completions/mean_length": 225.7578125,
|
|
"completions/mean_terminated_length": 225.7578125,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.7664,
|
|
"grad_norm": 0.039327558130025864,
|
|
"learning_rate": 2.597864768683274e-07,
|
|
"loss": 0.0018,
|
|
"num_tokens": 234606322.0,
|
|
"reward": 1.312846064567566,
|
|
"reward_std": 0.09807312488555908,
|
|
"rewards/accuracy_reward_long_step": 0.41796875,
|
|
"rewards/final_brier_reward_long_step": 0.7961425185203552,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7833666205406189,
|
|
"step": 479
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 603.0,
|
|
"completions/max_terminated_length": 603.0,
|
|
"completions/mean_length": 224.43359375,
|
|
"completions/mean_terminated_length": 224.43359375,
|
|
"completions/min_length": 112.0,
|
|
"completions/min_terminated_length": 112.0,
|
|
"epoch": 0.768,
|
|
"grad_norm": 0.05755281448364258,
|
|
"learning_rate": 2.580071174377224e-07,
|
|
"loss": 0.0044,
|
|
"num_tokens": 235077329.0,
|
|
"reward": 1.3882198333740234,
|
|
"reward_std": 0.12557154893875122,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.7875798344612122,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.780924916267395,
|
|
"step": 480
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 630.0,
|
|
"completions/max_terminated_length": 630.0,
|
|
"completions/mean_length": 230.8125,
|
|
"completions/mean_terminated_length": 231.71766662597656,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.7696,
|
|
"grad_norm": 0.04049382731318474,
|
|
"learning_rate": 2.5622775800711744e-07,
|
|
"loss": -0.0087,
|
|
"num_tokens": 235557009.0,
|
|
"reward": 1.3864805698394775,
|
|
"reward_std": 0.1668914556503296,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.7072670459747314,
|
|
"rewards/format_reward_long_step": 0.98828125,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8777177929878235,
|
|
"step": 481
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 487.0,
|
|
"completions/max_terminated_length": 487.0,
|
|
"completions/mean_length": 232.35546875,
|
|
"completions/mean_terminated_length": 232.35546875,
|
|
"completions/min_length": 133.0,
|
|
"completions/min_terminated_length": 133.0,
|
|
"epoch": 0.7712,
|
|
"grad_norm": 0.04888352006673813,
|
|
"learning_rate": 2.5444839857651245e-07,
|
|
"loss": 0.0026,
|
|
"num_tokens": 236049124.0,
|
|
"reward": 1.5799849033355713,
|
|
"reward_std": 0.10926343500614166,
|
|
"rewards/accuracy_reward_long_step": 0.68359375,
|
|
"rewards/final_brier_reward_long_step": 0.763283908367157,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8222807049751282,
|
|
"step": 482
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 469.0,
|
|
"completions/max_terminated_length": 469.0,
|
|
"completions/mean_length": 233.8671875,
|
|
"completions/mean_terminated_length": 233.8671875,
|
|
"completions/min_length": 120.0,
|
|
"completions/min_terminated_length": 120.0,
|
|
"epoch": 0.7728,
|
|
"grad_norm": 0.035025861114263535,
|
|
"learning_rate": 2.5266903914590747e-07,
|
|
"loss": -0.0001,
|
|
"num_tokens": 236535578.0,
|
|
"reward": 1.404916524887085,
|
|
"reward_std": 0.10989418625831604,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.8273136615753174,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.839227557182312,
|
|
"step": 483
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 463.0,
|
|
"completions/max_terminated_length": 463.0,
|
|
"completions/mean_length": 229.078125,
|
|
"completions/mean_terminated_length": 229.078125,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.7744,
|
|
"grad_norm": 0.03963632136583328,
|
|
"learning_rate": 2.508896797153025e-07,
|
|
"loss": -0.0033,
|
|
"num_tokens": 237017478.0,
|
|
"reward": 1.5163967609405518,
|
|
"reward_std": 0.15413016080856323,
|
|
"rewards/accuracy_reward_long_step": 0.64453125,
|
|
"rewards/final_brier_reward_long_step": 0.6957645416259766,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7916977405548096,
|
|
"step": 484
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 445.0,
|
|
"completions/max_terminated_length": 445.0,
|
|
"completions/mean_length": 233.1484375,
|
|
"completions/mean_terminated_length": 233.1484375,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.776,
|
|
"grad_norm": 0.0386706106364727,
|
|
"learning_rate": 2.491103202846975e-07,
|
|
"loss": 0.0037,
|
|
"num_tokens": 237493188.0,
|
|
"reward": 1.385927677154541,
|
|
"reward_std": 0.19464275240898132,
|
|
"rewards/accuracy_reward_long_step": 0.4921875,
|
|
"rewards/final_brier_reward_long_step": 0.7536214590072632,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8213395476341248,
|
|
"step": 485
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 522.0,
|
|
"completions/max_terminated_length": 522.0,
|
|
"completions/mean_length": 238.83984375,
|
|
"completions/mean_terminated_length": 238.83984375,
|
|
"completions/min_length": 130.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 0.7776,
|
|
"grad_norm": 0.040338389575481415,
|
|
"learning_rate": 2.473309608540925e-07,
|
|
"loss": 0.0038,
|
|
"num_tokens": 237987779.0,
|
|
"reward": 1.5493258237838745,
|
|
"reward_std": 0.15063825249671936,
|
|
"rewards/accuracy_reward_long_step": 0.66015625,
|
|
"rewards/final_brier_reward_long_step": 0.7311683893203735,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8255099058151245,
|
|
"step": 486
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 467.0,
|
|
"completions/max_terminated_length": 467.0,
|
|
"completions/mean_length": 219.1171875,
|
|
"completions/mean_terminated_length": 219.1171875,
|
|
"completions/min_length": 138.0,
|
|
"completions/min_terminated_length": 138.0,
|
|
"epoch": 0.7792,
|
|
"grad_norm": 0.059350065886974335,
|
|
"learning_rate": 2.455516014234875e-07,
|
|
"loss": 0.0032,
|
|
"num_tokens": 238479841.0,
|
|
"reward": 1.5190156698226929,
|
|
"reward_std": 0.1180032342672348,
|
|
"rewards/accuracy_reward_long_step": 0.625,
|
|
"rewards/final_brier_reward_long_step": 0.7632279396057129,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8128350973129272,
|
|
"step": 487
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 595.0,
|
|
"completions/max_terminated_length": 595.0,
|
|
"completions/mean_length": 239.76953125,
|
|
"completions/mean_terminated_length": 239.76953125,
|
|
"completions/min_length": 92.0,
|
|
"completions/min_terminated_length": 92.0,
|
|
"epoch": 0.7808,
|
|
"grad_norm": 0.04275665804743767,
|
|
"learning_rate": 2.4377224199288254e-07,
|
|
"loss": 0.0087,
|
|
"num_tokens": 238974614.0,
|
|
"reward": 1.480884313583374,
|
|
"reward_std": 0.11418268084526062,
|
|
"rewards/accuracy_reward_long_step": 0.58984375,
|
|
"rewards/final_brier_reward_long_step": 0.7228184938430786,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8413438200950623,
|
|
"step": 488
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 570.0,
|
|
"completions/max_terminated_length": 570.0,
|
|
"completions/mean_length": 235.77734375,
|
|
"completions/mean_terminated_length": 235.77734375,
|
|
"completions/min_length": 143.0,
|
|
"completions/min_terminated_length": 143.0,
|
|
"epoch": 0.7824,
|
|
"grad_norm": 0.04414826259016991,
|
|
"learning_rate": 2.4199288256227755e-07,
|
|
"loss": -0.0019,
|
|
"num_tokens": 239466805.0,
|
|
"reward": 1.2463829517364502,
|
|
"reward_std": 0.14157749712467194,
|
|
"rewards/accuracy_reward_long_step": 0.3984375,
|
|
"rewards/final_brier_reward_long_step": 0.6166784763336182,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7751035094261169,
|
|
"step": 489
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 510.0,
|
|
"completions/max_terminated_length": 510.0,
|
|
"completions/mean_length": 246.86328125,
|
|
"completions/mean_terminated_length": 246.86328125,
|
|
"completions/min_length": 87.0,
|
|
"completions/min_terminated_length": 87.0,
|
|
"epoch": 0.784,
|
|
"grad_norm": 0.035570476204156876,
|
|
"learning_rate": 2.4021352313167257e-07,
|
|
"loss": 0.0011,
|
|
"num_tokens": 239961386.0,
|
|
"reward": 1.3018330335617065,
|
|
"reward_std": 0.10914282500743866,
|
|
"rewards/accuracy_reward_long_step": 0.40625,
|
|
"rewards/final_brier_reward_long_step": 0.7352542877197266,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8470777869224548,
|
|
"step": 490
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 553.0,
|
|
"completions/max_terminated_length": 553.0,
|
|
"completions/mean_length": 238.81640625,
|
|
"completions/mean_terminated_length": 238.81640625,
|
|
"completions/min_length": 133.0,
|
|
"completions/min_terminated_length": 133.0,
|
|
"epoch": 0.7856,
|
|
"grad_norm": 0.04079211875796318,
|
|
"learning_rate": 2.3843416370106764e-07,
|
|
"loss": 0.0092,
|
|
"num_tokens": 240441995.0,
|
|
"reward": 1.3198940753936768,
|
|
"reward_std": 0.1449519544839859,
|
|
"rewards/accuracy_reward_long_step": 0.45703125,
|
|
"rewards/final_brier_reward_long_step": 0.6919292211532593,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7595219612121582,
|
|
"step": 491
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 551.0,
|
|
"completions/max_terminated_length": 551.0,
|
|
"completions/mean_length": 239.6015625,
|
|
"completions/mean_terminated_length": 239.6015625,
|
|
"completions/min_length": 125.0,
|
|
"completions/min_terminated_length": 125.0,
|
|
"epoch": 0.7872,
|
|
"grad_norm": 0.051125992089509964,
|
|
"learning_rate": 2.366548042704626e-07,
|
|
"loss": 0.0046,
|
|
"num_tokens": 240913021.0,
|
|
"reward": 1.5111445188522339,
|
|
"reward_std": 0.1414456069469452,
|
|
"rewards/accuracy_reward_long_step": 0.609375,
|
|
"rewards/final_brier_reward_long_step": 0.7685543298721313,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.838523805141449,
|
|
"step": 492
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 408.0,
|
|
"completions/max_terminated_length": 408.0,
|
|
"completions/mean_length": 224.24609375,
|
|
"completions/mean_terminated_length": 224.24609375,
|
|
"completions/min_length": 118.0,
|
|
"completions/min_terminated_length": 118.0,
|
|
"epoch": 0.7888,
|
|
"grad_norm": 0.0370771661400795,
|
|
"learning_rate": 2.3487544483985764e-07,
|
|
"loss": 0.011,
|
|
"num_tokens": 241386988.0,
|
|
"reward": 1.56308913230896,
|
|
"reward_std": 0.10970332473516464,
|
|
"rewards/accuracy_reward_long_step": 0.66015625,
|
|
"rewards/final_brier_reward_long_step": 0.8099468946456909,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8017843961715698,
|
|
"step": 493
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 472.0,
|
|
"completions/max_terminated_length": 472.0,
|
|
"completions/mean_length": 240.046875,
|
|
"completions/mean_terminated_length": 240.046875,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.7904,
|
|
"grad_norm": 0.0435328409075737,
|
|
"learning_rate": 2.3309608540925265e-07,
|
|
"loss": 0.0084,
|
|
"num_tokens": 241883192.0,
|
|
"reward": 1.4550728797912598,
|
|
"reward_std": 0.056624144315719604,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.8555335998535156,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7928834557533264,
|
|
"step": 494
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 718.0,
|
|
"completions/max_terminated_length": 718.0,
|
|
"completions/mean_length": 237.8515625,
|
|
"completions/mean_terminated_length": 237.8515625,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.792,
|
|
"grad_norm": 0.03587677702307701,
|
|
"learning_rate": 2.313167259786477e-07,
|
|
"loss": 0.0089,
|
|
"num_tokens": 242375578.0,
|
|
"reward": 1.5103130340576172,
|
|
"reward_std": 0.10776931047439575,
|
|
"rewards/accuracy_reward_long_step": 0.59765625,
|
|
"rewards/final_brier_reward_long_step": 0.8439902067184448,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8066369295120239,
|
|
"step": 495
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 945.0,
|
|
"completions/max_terminated_length": 945.0,
|
|
"completions/mean_length": 239.59375,
|
|
"completions/mean_terminated_length": 239.59375,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.7936,
|
|
"grad_norm": 0.041137006133794785,
|
|
"learning_rate": 2.295373665480427e-07,
|
|
"loss": 0.0062,
|
|
"num_tokens": 242859330.0,
|
|
"reward": 1.4777976274490356,
|
|
"reward_std": 0.14235195517539978,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.8539682030677795,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7759724259376526,
|
|
"step": 496
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 544.0,
|
|
"completions/max_terminated_length": 544.0,
|
|
"completions/mean_length": 230.46484375,
|
|
"completions/mean_terminated_length": 230.46484375,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.7952,
|
|
"grad_norm": 0.03613395616412163,
|
|
"learning_rate": 2.277580071174377e-07,
|
|
"loss": -0.0055,
|
|
"num_tokens": 243345321.0,
|
|
"reward": 1.5040514469146729,
|
|
"reward_std": 0.1266600787639618,
|
|
"rewards/accuracy_reward_long_step": 0.60546875,
|
|
"rewards/final_brier_reward_long_step": 0.8192323446273804,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7750980257987976,
|
|
"step": 497
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 378.0,
|
|
"completions/max_terminated_length": 378.0,
|
|
"completions/mean_length": 226.73828125,
|
|
"completions/mean_terminated_length": 226.73828125,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.7968,
|
|
"grad_norm": 0.04258696362376213,
|
|
"learning_rate": 2.2597864768683274e-07,
|
|
"loss": -0.0045,
|
|
"num_tokens": 243827942.0,
|
|
"reward": 1.3869428634643555,
|
|
"reward_std": 0.1397380828857422,
|
|
"rewards/accuracy_reward_long_step": 0.4921875,
|
|
"rewards/final_brier_reward_long_step": 0.7650054693222046,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8140161037445068,
|
|
"step": 498
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 530.0,
|
|
"completions/max_terminated_length": 530.0,
|
|
"completions/mean_length": 240.43359375,
|
|
"completions/mean_terminated_length": 240.43359375,
|
|
"completions/min_length": 104.0,
|
|
"completions/min_terminated_length": 104.0,
|
|
"epoch": 0.7984,
|
|
"grad_norm": 0.03320414200425148,
|
|
"learning_rate": 2.2419928825622775e-07,
|
|
"loss": -0.0118,
|
|
"num_tokens": 244325933.0,
|
|
"reward": 1.4908723831176758,
|
|
"reward_std": 0.16621750593185425,
|
|
"rewards/accuracy_reward_long_step": 0.60546875,
|
|
"rewards/final_brier_reward_long_step": 0.7513167858123779,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7902982234954834,
|
|
"step": 499
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 454.0,
|
|
"completions/max_terminated_length": 454.0,
|
|
"completions/mean_length": 226.42578125,
|
|
"completions/mean_terminated_length": 226.42578125,
|
|
"completions/min_length": 123.0,
|
|
"completions/min_terminated_length": 123.0,
|
|
"epoch": 0.8,
|
|
"grad_norm": 0.03856438770890236,
|
|
"learning_rate": 2.2241992882562277e-07,
|
|
"loss": 0.0033,
|
|
"num_tokens": 244805666.0,
|
|
"reward": 1.5031077861785889,
|
|
"reward_std": 0.12228081375360489,
|
|
"rewards/accuracy_reward_long_step": 0.59375,
|
|
"rewards/final_brier_reward_long_step": 0.8210663795471191,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8163642883300781,
|
|
"step": 500
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 433.0,
|
|
"completions/max_terminated_length": 433.0,
|
|
"completions/mean_length": 220.3984375,
|
|
"completions/mean_terminated_length": 220.3984375,
|
|
"completions/min_length": 120.0,
|
|
"completions/min_terminated_length": 120.0,
|
|
"epoch": 0.8016,
|
|
"grad_norm": 0.0355200357735157,
|
|
"learning_rate": 2.206405693950178e-07,
|
|
"loss": -0.0003,
|
|
"num_tokens": 245298544.0,
|
|
"reward": 1.5138522386550903,
|
|
"reward_std": 0.15135133266448975,
|
|
"rewards/accuracy_reward_long_step": 0.578125,
|
|
"rewards/final_brier_reward_long_step": 0.8448459506034851,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.9058756828308105,
|
|
"step": 501
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 425.0,
|
|
"completions/max_terminated_length": 425.0,
|
|
"completions/mean_length": 237.34765625,
|
|
"completions/mean_terminated_length": 237.34765625,
|
|
"completions/min_length": 130.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 0.8032,
|
|
"grad_norm": 0.037190381437540054,
|
|
"learning_rate": 2.188612099644128e-07,
|
|
"loss": 0.0034,
|
|
"num_tokens": 245784137.0,
|
|
"reward": 1.5037150382995605,
|
|
"reward_std": 0.15791866183280945,
|
|
"rewards/accuracy_reward_long_step": 0.609375,
|
|
"rewards/final_brier_reward_long_step": 0.7559190988540649,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8214409947395325,
|
|
"step": 502
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 555.0,
|
|
"completions/max_terminated_length": 555.0,
|
|
"completions/mean_length": 224.734375,
|
|
"completions/mean_terminated_length": 224.734375,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.8048,
|
|
"grad_norm": 0.0395224392414093,
|
|
"learning_rate": 2.170818505338078e-07,
|
|
"loss": 0.0042,
|
|
"num_tokens": 246249421.0,
|
|
"reward": 1.5526416301727295,
|
|
"reward_std": 0.11045798659324646,
|
|
"rewards/accuracy_reward_long_step": 0.6484375,
|
|
"rewards/final_brier_reward_long_step": 0.818356990814209,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7984594702720642,
|
|
"step": 503
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 514.0,
|
|
"completions/max_terminated_length": 514.0,
|
|
"completions/mean_length": 236.96484375,
|
|
"completions/mean_terminated_length": 236.96484375,
|
|
"completions/min_length": 135.0,
|
|
"completions/min_terminated_length": 135.0,
|
|
"epoch": 0.8064,
|
|
"grad_norm": 0.03467653691768646,
|
|
"learning_rate": 2.1530249110320285e-07,
|
|
"loss": 0.0056,
|
|
"num_tokens": 246741180.0,
|
|
"reward": 1.4294129610061646,
|
|
"reward_std": 0.15054050087928772,
|
|
"rewards/accuracy_reward_long_step": 0.55859375,
|
|
"rewards/final_brier_reward_long_step": 0.730989396572113,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7522871494293213,
|
|
"step": 504
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 491.0,
|
|
"completions/max_terminated_length": 491.0,
|
|
"completions/mean_length": 244.40234375,
|
|
"completions/mean_terminated_length": 244.40234375,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.808,
|
|
"grad_norm": 0.03365206718444824,
|
|
"learning_rate": 2.1352313167259786e-07,
|
|
"loss": -0.0079,
|
|
"num_tokens": 247235891.0,
|
|
"reward": 1.3486175537109375,
|
|
"reward_std": 0.10975901782512665,
|
|
"rewards/accuracy_reward_long_step": 0.44140625,
|
|
"rewards/final_brier_reward_long_step": 0.8065632581710815,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8222817182540894,
|
|
"step": 505
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 522.0,
|
|
"completions/max_terminated_length": 522.0,
|
|
"completions/mean_length": 230.9921875,
|
|
"completions/mean_terminated_length": 230.9921875,
|
|
"completions/min_length": 125.0,
|
|
"completions/min_terminated_length": 125.0,
|
|
"epoch": 0.8096,
|
|
"grad_norm": 0.041645560413599014,
|
|
"learning_rate": 2.1174377224199288e-07,
|
|
"loss": 0.0051,
|
|
"num_tokens": 247714529.0,
|
|
"reward": 1.614863395690918,
|
|
"reward_std": 0.1503724455833435,
|
|
"rewards/accuracy_reward_long_step": 0.703125,
|
|
"rewards/final_brier_reward_long_step": 0.7402952909469604,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.9066582918167114,
|
|
"step": 506
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 606.0,
|
|
"completions/max_terminated_length": 606.0,
|
|
"completions/mean_length": 236.5625,
|
|
"completions/mean_terminated_length": 236.5625,
|
|
"completions/min_length": 133.0,
|
|
"completions/min_terminated_length": 133.0,
|
|
"epoch": 0.8112,
|
|
"grad_norm": 0.045049868524074554,
|
|
"learning_rate": 2.099644128113879e-07,
|
|
"loss": -0.0034,
|
|
"num_tokens": 248209161.0,
|
|
"reward": 1.312518835067749,
|
|
"reward_std": 0.1855650395154953,
|
|
"rewards/accuracy_reward_long_step": 0.44140625,
|
|
"rewards/final_brier_reward_long_step": 0.7521023154258728,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7323479056358337,
|
|
"step": 507
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 455.0,
|
|
"completions/max_terminated_length": 455.0,
|
|
"completions/mean_length": 229.140625,
|
|
"completions/mean_terminated_length": 229.140625,
|
|
"completions/min_length": 116.0,
|
|
"completions/min_terminated_length": 116.0,
|
|
"epoch": 0.8128,
|
|
"grad_norm": 0.047168269753456116,
|
|
"learning_rate": 2.081850533807829e-07,
|
|
"loss": 0.0165,
|
|
"num_tokens": 248688005.0,
|
|
"reward": 1.4646296501159668,
|
|
"reward_std": 0.09372396022081375,
|
|
"rewards/accuracy_reward_long_step": 0.57421875,
|
|
"rewards/final_brier_reward_long_step": 0.774226188659668,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7874171733856201,
|
|
"step": 508
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 486.0,
|
|
"completions/max_terminated_length": 486.0,
|
|
"completions/mean_length": 234.80078125,
|
|
"completions/mean_terminated_length": 234.80078125,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 0.8144,
|
|
"grad_norm": 0.04063938930630684,
|
|
"learning_rate": 2.0640569395017792e-07,
|
|
"loss": -0.015,
|
|
"num_tokens": 249176586.0,
|
|
"reward": 1.3976056575775146,
|
|
"reward_std": 0.1380692720413208,
|
|
"rewards/accuracy_reward_long_step": 0.546875,
|
|
"rewards/final_brier_reward_long_step": 0.6868070363998413,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7161159515380859,
|
|
"step": 509
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 457.0,
|
|
"completions/max_terminated_length": 457.0,
|
|
"completions/mean_length": 234.921875,
|
|
"completions/mean_terminated_length": 234.921875,
|
|
"completions/min_length": 100.0,
|
|
"completions/min_terminated_length": 100.0,
|
|
"epoch": 0.816,
|
|
"grad_norm": 0.04220227152109146,
|
|
"learning_rate": 2.0462633451957296e-07,
|
|
"loss": -0.0111,
|
|
"num_tokens": 249668526.0,
|
|
"reward": 1.4461275339126587,
|
|
"reward_std": 0.17553001642227173,
|
|
"rewards/accuracy_reward_long_step": 0.5859375,
|
|
"rewards/final_brier_reward_long_step": 0.7025785446166992,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7381817102432251,
|
|
"step": 510
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 435.0,
|
|
"completions/max_terminated_length": 435.0,
|
|
"completions/mean_length": 223.1015625,
|
|
"completions/mean_terminated_length": 223.1015625,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.8176,
|
|
"grad_norm": 0.04910369962453842,
|
|
"learning_rate": 2.0284697508896798e-07,
|
|
"loss": 0.013,
|
|
"num_tokens": 250138040.0,
|
|
"reward": 1.480068325996399,
|
|
"reward_std": 0.17966505885124207,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.8528038263320923,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.9112191200256348,
|
|
"step": 511
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 389.0,
|
|
"completions/max_terminated_length": 389.0,
|
|
"completions/mean_length": 235.9375,
|
|
"completions/mean_terminated_length": 235.9375,
|
|
"completions/min_length": 111.0,
|
|
"completions/min_terminated_length": 111.0,
|
|
"epoch": 0.8192,
|
|
"grad_norm": 0.03783193230628967,
|
|
"learning_rate": 2.0106761565836297e-07,
|
|
"loss": 0.0023,
|
|
"num_tokens": 250629424.0,
|
|
"reward": 1.5700932741165161,
|
|
"reward_std": 0.12437284737825394,
|
|
"rewards/accuracy_reward_long_step": 0.66015625,
|
|
"rewards/final_brier_reward_long_step": 0.8810421824455261,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7587060928344727,
|
|
"step": 512
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 510.0,
|
|
"completions/max_terminated_length": 510.0,
|
|
"completions/mean_length": 255.25390625,
|
|
"completions/mean_terminated_length": 255.25390625,
|
|
"completions/min_length": 126.0,
|
|
"completions/min_terminated_length": 126.0,
|
|
"epoch": 0.8208,
|
|
"grad_norm": 0.052237872034311295,
|
|
"learning_rate": 1.99288256227758e-07,
|
|
"loss": -0.0014,
|
|
"num_tokens": 251129969.0,
|
|
"reward": 1.3335564136505127,
|
|
"reward_std": 0.13581520318984985,
|
|
"rewards/accuracy_reward_long_step": 0.453125,
|
|
"rewards/final_brier_reward_long_step": 0.7354176044464111,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7863079905509949,
|
|
"step": 513
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 520.0,
|
|
"completions/max_terminated_length": 520.0,
|
|
"completions/mean_length": 227.5546875,
|
|
"completions/mean_terminated_length": 227.5546875,
|
|
"completions/min_length": 122.0,
|
|
"completions/min_terminated_length": 122.0,
|
|
"epoch": 0.8224,
|
|
"grad_norm": 0.03678586706519127,
|
|
"learning_rate": 1.9750889679715302e-07,
|
|
"loss": -0.0089,
|
|
"num_tokens": 251612919.0,
|
|
"reward": 1.4214198589324951,
|
|
"reward_std": 0.1188969761133194,
|
|
"rewards/accuracy_reward_long_step": 0.52734375,
|
|
"rewards/final_brier_reward_long_step": 0.7470394372940063,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8292652368545532,
|
|
"step": 514
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 514.0,
|
|
"completions/max_terminated_length": 514.0,
|
|
"completions/mean_length": 245.015625,
|
|
"completions/mean_terminated_length": 245.015625,
|
|
"completions/min_length": 133.0,
|
|
"completions/min_terminated_length": 133.0,
|
|
"epoch": 0.824,
|
|
"grad_norm": 0.05070950463414192,
|
|
"learning_rate": 1.9572953736654804e-07,
|
|
"loss": 0.0063,
|
|
"num_tokens": 252120547.0,
|
|
"reward": 1.5003015995025635,
|
|
"reward_std": 0.14131146669387817,
|
|
"rewards/accuracy_reward_long_step": 0.609375,
|
|
"rewards/final_brier_reward_long_step": 0.7343014478683472,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8294050693511963,
|
|
"step": 515
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 586.0,
|
|
"completions/max_terminated_length": 586.0,
|
|
"completions/mean_length": 234.69921875,
|
|
"completions/mean_terminated_length": 234.69921875,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 0.8256,
|
|
"grad_norm": 0.04203731194138527,
|
|
"learning_rate": 1.9395017793594305e-07,
|
|
"loss": 0.0013,
|
|
"num_tokens": 252610774.0,
|
|
"reward": 1.4129828214645386,
|
|
"reward_std": 0.09008841961622238,
|
|
"rewards/accuracy_reward_long_step": 0.51171875,
|
|
"rewards/final_brier_reward_long_step": 0.7625414133071899,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8425147533416748,
|
|
"step": 516
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 516.0,
|
|
"completions/max_terminated_length": 516.0,
|
|
"completions/mean_length": 244.05078125,
|
|
"completions/mean_terminated_length": 244.05078125,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.8272,
|
|
"grad_norm": 0.04504576325416565,
|
|
"learning_rate": 1.9217081850533807e-07,
|
|
"loss": 0.0092,
|
|
"num_tokens": 253084835.0,
|
|
"reward": 1.3626664876937866,
|
|
"reward_std": 0.13317476212978363,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.7290824055671692,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7372086048126221,
|
|
"step": 517
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 535.0,
|
|
"completions/max_terminated_length": 535.0,
|
|
"completions/mean_length": 235.015625,
|
|
"completions/mean_terminated_length": 235.015625,
|
|
"completions/min_length": 103.0,
|
|
"completions/min_terminated_length": 103.0,
|
|
"epoch": 0.8288,
|
|
"grad_norm": 0.04393794387578964,
|
|
"learning_rate": 1.9039145907473308e-07,
|
|
"loss": 0.014,
|
|
"num_tokens": 253571311.0,
|
|
"reward": 1.5000429153442383,
|
|
"reward_std": 0.13177230954170227,
|
|
"rewards/accuracy_reward_long_step": 0.5859375,
|
|
"rewards/final_brier_reward_long_step": 0.7909968495368958,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8654246926307678,
|
|
"step": 518
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 596.0,
|
|
"completions/max_terminated_length": 596.0,
|
|
"completions/mean_length": 247.33203125,
|
|
"completions/mean_terminated_length": 247.33203125,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.8304,
|
|
"grad_norm": 0.03598223626613617,
|
|
"learning_rate": 1.8861209964412812e-07,
|
|
"loss": 0.0185,
|
|
"num_tokens": 254043356.0,
|
|
"reward": 1.3758127689361572,
|
|
"reward_std": 0.17646163702011108,
|
|
"rewards/accuracy_reward_long_step": 0.47265625,
|
|
"rewards/final_brier_reward_long_step": 0.7502039074897766,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8624222278594971,
|
|
"step": 519
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 550.0,
|
|
"completions/max_terminated_length": 550.0,
|
|
"completions/mean_length": 236.67578125,
|
|
"completions/mean_terminated_length": 236.67578125,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.832,
|
|
"grad_norm": 0.03368176519870758,
|
|
"learning_rate": 1.8683274021352314e-07,
|
|
"loss": 0.0061,
|
|
"num_tokens": 254522601.0,
|
|
"reward": 1.4897700548171997,
|
|
"reward_std": 0.10578904300928116,
|
|
"rewards/accuracy_reward_long_step": 0.58984375,
|
|
"rewards/final_brier_reward_long_step": 0.7538363337516785,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8458687663078308,
|
|
"step": 520
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 458.0,
|
|
"completions/max_terminated_length": 458.0,
|
|
"completions/mean_length": 230.203125,
|
|
"completions/mean_terminated_length": 230.203125,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.8336,
|
|
"grad_norm": 0.05814650282263756,
|
|
"learning_rate": 1.8505338078291812e-07,
|
|
"loss": 0.0063,
|
|
"num_tokens": 254993893.0,
|
|
"reward": 1.419898271560669,
|
|
"reward_std": 0.11159418523311615,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.7294105291366577,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7939323782920837,
|
|
"step": 521
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 633.0,
|
|
"completions/max_terminated_length": 633.0,
|
|
"completions/mean_length": 240.46875,
|
|
"completions/mean_terminated_length": 240.46875,
|
|
"completions/min_length": 136.0,
|
|
"completions/min_terminated_length": 136.0,
|
|
"epoch": 0.8352,
|
|
"grad_norm": 0.03419061005115509,
|
|
"learning_rate": 1.8327402135231316e-07,
|
|
"loss": -0.0096,
|
|
"num_tokens": 255490549.0,
|
|
"reward": 1.4004169702529907,
|
|
"reward_std": 0.12270954251289368,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.8064777851104736,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8889400959014893,
|
|
"step": 522
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 499.0,
|
|
"completions/max_terminated_length": 499.0,
|
|
"completions/mean_length": 242.21875,
|
|
"completions/mean_terminated_length": 242.21875,
|
|
"completions/min_length": 130.0,
|
|
"completions/min_terminated_length": 130.0,
|
|
"epoch": 0.8368,
|
|
"grad_norm": 0.036356884986162186,
|
|
"learning_rate": 1.8149466192170818e-07,
|
|
"loss": -0.0055,
|
|
"num_tokens": 255991109.0,
|
|
"reward": 1.4097753763198853,
|
|
"reward_std": 0.10558044910430908,
|
|
"rewards/accuracy_reward_long_step": 0.51953125,
|
|
"rewards/final_brier_reward_long_step": 0.7266496419906616,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8343270421028137,
|
|
"step": 523
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 494.0,
|
|
"completions/max_terminated_length": 494.0,
|
|
"completions/mean_length": 235.08203125,
|
|
"completions/mean_terminated_length": 235.08203125,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.8384,
|
|
"grad_norm": 0.04891684278845787,
|
|
"learning_rate": 1.797153024911032e-07,
|
|
"loss": 0.0014,
|
|
"num_tokens": 256465850.0,
|
|
"reward": 1.4318642616271973,
|
|
"reward_std": 0.12251758575439453,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.768867552280426,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8023396730422974,
|
|
"step": 524
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 477.0,
|
|
"completions/max_terminated_length": 477.0,
|
|
"completions/mean_length": 231.28125,
|
|
"completions/mean_terminated_length": 231.28125,
|
|
"completions/min_length": 119.0,
|
|
"completions/min_terminated_length": 119.0,
|
|
"epoch": 0.84,
|
|
"grad_norm": 0.04874948784708977,
|
|
"learning_rate": 1.7793594306049823e-07,
|
|
"loss": -0.0015,
|
|
"num_tokens": 256950562.0,
|
|
"reward": 1.454651117324829,
|
|
"reward_std": 0.15635967254638672,
|
|
"rewards/accuracy_reward_long_step": 0.578125,
|
|
"rewards/final_brier_reward_long_step": 0.686775803565979,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8193286657333374,
|
|
"step": 525
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 503.0,
|
|
"completions/max_terminated_length": 503.0,
|
|
"completions/mean_length": 247.87109375,
|
|
"completions/mean_terminated_length": 247.87109375,
|
|
"completions/min_length": 106.0,
|
|
"completions/min_terminated_length": 106.0,
|
|
"epoch": 0.8416,
|
|
"grad_norm": 0.05547276511788368,
|
|
"learning_rate": 1.7615658362989322e-07,
|
|
"loss": 0.0132,
|
|
"num_tokens": 257429993.0,
|
|
"reward": 1.3650845289230347,
|
|
"reward_std": 0.16451743245124817,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.779395341873169,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8059428930282593,
|
|
"step": 526
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 786.0,
|
|
"completions/max_terminated_length": 786.0,
|
|
"completions/mean_length": 250.98828125,
|
|
"completions/mean_terminated_length": 250.98828125,
|
|
"completions/min_length": 129.0,
|
|
"completions/min_terminated_length": 129.0,
|
|
"epoch": 0.8432,
|
|
"grad_norm": 0.035361409187316895,
|
|
"learning_rate": 1.7437722419928824e-07,
|
|
"loss": 0.0091,
|
|
"num_tokens": 257921134.0,
|
|
"reward": 1.50462007522583,
|
|
"reward_std": 0.12116993218660355,
|
|
"rewards/accuracy_reward_long_step": 0.60546875,
|
|
"rewards/final_brier_reward_long_step": 0.7998992204666138,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7967061400413513,
|
|
"step": 527
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 548.0,
|
|
"completions/max_terminated_length": 548.0,
|
|
"completions/mean_length": 253.18359375,
|
|
"completions/mean_terminated_length": 253.18359375,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.8448,
|
|
"grad_norm": 0.03830602765083313,
|
|
"learning_rate": 1.7259786476868328e-07,
|
|
"loss": -0.0055,
|
|
"num_tokens": 258402709.0,
|
|
"reward": 1.3311948776245117,
|
|
"reward_std": 0.11117161065340042,
|
|
"rewards/accuracy_reward_long_step": 0.46484375,
|
|
"rewards/final_brier_reward_long_step": 0.7396460771560669,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7257586717605591,
|
|
"step": 528
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 427.0,
|
|
"completions/max_terminated_length": 427.0,
|
|
"completions/mean_length": 249.9921875,
|
|
"completions/mean_terminated_length": 249.9921875,
|
|
"completions/min_length": 112.0,
|
|
"completions/min_terminated_length": 112.0,
|
|
"epoch": 0.8464,
|
|
"grad_norm": 0.04274242743849754,
|
|
"learning_rate": 1.708185053380783e-07,
|
|
"loss": 0.0085,
|
|
"num_tokens": 258892547.0,
|
|
"reward": 1.5267962217330933,
|
|
"reward_std": 0.10823096334934235,
|
|
"rewards/accuracy_reward_long_step": 0.609375,
|
|
"rewards/final_brier_reward_long_step": 0.8286827802658081,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8410018086433411,
|
|
"step": 529
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 533.0,
|
|
"completions/max_terminated_length": 533.0,
|
|
"completions/mean_length": 247.6875,
|
|
"completions/mean_terminated_length": 247.6875,
|
|
"completions/min_length": 139.0,
|
|
"completions/min_terminated_length": 139.0,
|
|
"epoch": 0.848,
|
|
"grad_norm": 0.046045657247304916,
|
|
"learning_rate": 1.690391459074733e-07,
|
|
"loss": -0.0093,
|
|
"num_tokens": 259389531.0,
|
|
"reward": 1.3362867832183838,
|
|
"reward_std": 0.10417808592319489,
|
|
"rewards/accuracy_reward_long_step": 0.4296875,
|
|
"rewards/final_brier_reward_long_step": 0.8625573515892029,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7638399600982666,
|
|
"step": 530
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 503.0,
|
|
"completions/max_terminated_length": 503.0,
|
|
"completions/mean_length": 237.73828125,
|
|
"completions/mean_terminated_length": 237.73828125,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.8496,
|
|
"grad_norm": 0.055190231651067734,
|
|
"learning_rate": 1.6725978647686832e-07,
|
|
"loss": -0.0043,
|
|
"num_tokens": 259886432.0,
|
|
"reward": 1.4131598472595215,
|
|
"reward_std": 0.12312982231378555,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.6990100145339966,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7817543745040894,
|
|
"step": 531
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 386.0,
|
|
"completions/max_terminated_length": 386.0,
|
|
"completions/mean_length": 232.515625,
|
|
"completions/mean_terminated_length": 232.515625,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.8512,
|
|
"grad_norm": 0.03550275042653084,
|
|
"learning_rate": 1.6548042704626334e-07,
|
|
"loss": -0.0068,
|
|
"num_tokens": 260358252.0,
|
|
"reward": 1.4527807235717773,
|
|
"reward_std": 0.13987179100513458,
|
|
"rewards/accuracy_reward_long_step": 0.578125,
|
|
"rewards/final_brier_reward_long_step": 0.7386081218719482,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7678276300430298,
|
|
"step": 532
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 606.0,
|
|
"completions/max_terminated_length": 606.0,
|
|
"completions/mean_length": 237.8671875,
|
|
"completions/mean_terminated_length": 238.80001831054688,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 94.0,
|
|
"epoch": 0.8528,
|
|
"grad_norm": 0.040301430970430374,
|
|
"learning_rate": 1.6370106761565835e-07,
|
|
"loss": -0.024,
|
|
"num_tokens": 260832714.0,
|
|
"reward": 1.428901195526123,
|
|
"reward_std": 0.09275516867637634,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.776361346244812,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8533056974411011,
|
|
"step": 533
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 462.0,
|
|
"completions/max_terminated_length": 462.0,
|
|
"completions/mean_length": 238.7578125,
|
|
"completions/mean_terminated_length": 238.7578125,
|
|
"completions/min_length": 133.0,
|
|
"completions/min_terminated_length": 133.0,
|
|
"epoch": 0.8544,
|
|
"grad_norm": 0.04402696341276169,
|
|
"learning_rate": 1.619217081850534e-07,
|
|
"loss": 0.0134,
|
|
"num_tokens": 261320468.0,
|
|
"reward": 1.4897812604904175,
|
|
"reward_std": 0.09281490743160248,
|
|
"rewards/accuracy_reward_long_step": 0.59765625,
|
|
"rewards/final_brier_reward_long_step": 0.7256316542625427,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.842868447303772,
|
|
"step": 534
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00390625,
|
|
"completions/max_length": 517.0,
|
|
"completions/max_terminated_length": 517.0,
|
|
"completions/mean_length": 227.56640625,
|
|
"completions/mean_terminated_length": 228.45883178710938,
|
|
"completions/min_length": 0.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.856,
|
|
"grad_norm": 0.04461900517344475,
|
|
"learning_rate": 1.601423487544484e-07,
|
|
"loss": -0.0036,
|
|
"num_tokens": 261802045.0,
|
|
"reward": 1.5405223369598389,
|
|
"reward_std": 0.1693045049905777,
|
|
"rewards/accuracy_reward_long_step": 0.66015625,
|
|
"rewards/final_brier_reward_long_step": 0.6963077783584595,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8329689502716064,
|
|
"step": 535
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 630.0,
|
|
"completions/max_terminated_length": 630.0,
|
|
"completions/mean_length": 243.41015625,
|
|
"completions/mean_terminated_length": 243.41015625,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.8576,
|
|
"grad_norm": 0.0438154935836792,
|
|
"learning_rate": 1.583629893238434e-07,
|
|
"loss": 0.0014,
|
|
"num_tokens": 262282190.0,
|
|
"reward": 1.3707561492919922,
|
|
"reward_std": 0.16878756880760193,
|
|
"rewards/accuracy_reward_long_step": 0.4609375,
|
|
"rewards/final_brier_reward_long_step": 0.7976784706115723,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8415963649749756,
|
|
"step": 536
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 528.0,
|
|
"completions/max_terminated_length": 528.0,
|
|
"completions/mean_length": 233.390625,
|
|
"completions/mean_terminated_length": 233.390625,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.8592,
|
|
"grad_norm": 0.04220299795269966,
|
|
"learning_rate": 1.5658362989323843e-07,
|
|
"loss": 0.0062,
|
|
"num_tokens": 262769162.0,
|
|
"reward": 1.5402344465255737,
|
|
"reward_std": 0.12223983556032181,
|
|
"rewards/accuracy_reward_long_step": 0.609375,
|
|
"rewards/final_brier_reward_long_step": 0.8557167053222656,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8677208423614502,
|
|
"step": 537
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 601.0,
|
|
"completions/max_terminated_length": 601.0,
|
|
"completions/mean_length": 235.33203125,
|
|
"completions/mean_terminated_length": 235.33203125,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.8608,
|
|
"grad_norm": 0.03791709989309311,
|
|
"learning_rate": 1.5480427046263345e-07,
|
|
"loss": -0.0013,
|
|
"num_tokens": 263247303.0,
|
|
"reward": 1.5486342906951904,
|
|
"reward_std": 0.11805526912212372,
|
|
"rewards/accuracy_reward_long_step": 0.65625,
|
|
"rewards/final_brier_reward_long_step": 0.7473232746124268,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8222134709358215,
|
|
"step": 538
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 540.0,
|
|
"completions/max_terminated_length": 540.0,
|
|
"completions/mean_length": 238.59375,
|
|
"completions/mean_terminated_length": 238.59375,
|
|
"completions/min_length": 143.0,
|
|
"completions/min_terminated_length": 143.0,
|
|
"epoch": 0.8624,
|
|
"grad_norm": 0.04007653519511223,
|
|
"learning_rate": 1.5302491103202846e-07,
|
|
"loss": -0.0057,
|
|
"num_tokens": 263739847.0,
|
|
"reward": 1.5600248575210571,
|
|
"reward_std": 0.14350242912769318,
|
|
"rewards/accuracy_reward_long_step": 0.67578125,
|
|
"rewards/final_brier_reward_long_step": 0.7258714437484741,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8111032247543335,
|
|
"step": 539
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 541.0,
|
|
"completions/max_terminated_length": 541.0,
|
|
"completions/mean_length": 250.05859375,
|
|
"completions/mean_terminated_length": 250.05859375,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.864,
|
|
"grad_norm": 0.03347768262028694,
|
|
"learning_rate": 1.512455516014235e-07,
|
|
"loss": -0.0126,
|
|
"num_tokens": 264235814.0,
|
|
"reward": 1.495011568069458,
|
|
"reward_std": 0.08270839601755142,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.8477886915206909,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8822580575942993,
|
|
"step": 540
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 613.0,
|
|
"completions/max_terminated_length": 613.0,
|
|
"completions/mean_length": 261.34375,
|
|
"completions/mean_terminated_length": 261.34375,
|
|
"completions/min_length": 147.0,
|
|
"completions/min_terminated_length": 147.0,
|
|
"epoch": 0.8656,
|
|
"grad_norm": 0.03869554027915001,
|
|
"learning_rate": 1.494661921708185e-07,
|
|
"loss": -0.0073,
|
|
"num_tokens": 264741902.0,
|
|
"reward": 1.3477532863616943,
|
|
"reward_std": 0.1298760026693344,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.7019554376602173,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8140577077865601,
|
|
"step": 541
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 724.0,
|
|
"completions/max_terminated_length": 724.0,
|
|
"completions/mean_length": 240.37890625,
|
|
"completions/mean_terminated_length": 240.37890625,
|
|
"completions/min_length": 89.0,
|
|
"completions/min_terminated_length": 89.0,
|
|
"epoch": 0.8672,
|
|
"grad_norm": 0.036735206842422485,
|
|
"learning_rate": 1.476868327402135e-07,
|
|
"loss": 0.0053,
|
|
"num_tokens": 265240223.0,
|
|
"reward": 1.2603323459625244,
|
|
"reward_std": 0.05455077812075615,
|
|
"rewards/accuracy_reward_long_step": 0.375,
|
|
"rewards/final_brier_reward_long_step": 0.7330679893493652,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8082613945007324,
|
|
"step": 542
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 865.0,
|
|
"completions/max_terminated_length": 865.0,
|
|
"completions/mean_length": 226.53515625,
|
|
"completions/mean_terminated_length": 226.53515625,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.8688,
|
|
"grad_norm": 0.04960142448544502,
|
|
"learning_rate": 1.4590747330960855e-07,
|
|
"loss": 0.0033,
|
|
"num_tokens": 265742248.0,
|
|
"reward": 1.5600826740264893,
|
|
"reward_std": 0.10487354546785355,
|
|
"rewards/accuracy_reward_long_step": 0.65625,
|
|
"rewards/final_brier_reward_long_step": 0.7979686260223389,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8173620700836182,
|
|
"step": 543
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 665.0,
|
|
"completions/max_terminated_length": 665.0,
|
|
"completions/mean_length": 251.0546875,
|
|
"completions/mean_terminated_length": 251.0546875,
|
|
"completions/min_length": 102.0,
|
|
"completions/min_terminated_length": 102.0,
|
|
"epoch": 0.8704,
|
|
"grad_norm": 0.03681975230574608,
|
|
"learning_rate": 1.4412811387900356e-07,
|
|
"loss": 0.0118,
|
|
"num_tokens": 266235870.0,
|
|
"reward": 1.3697454929351807,
|
|
"reward_std": 0.14522971212863922,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.7746487855911255,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.798083484172821,
|
|
"step": 544
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 459.0,
|
|
"completions/max_terminated_length": 459.0,
|
|
"completions/mean_length": 229.875,
|
|
"completions/mean_terminated_length": 229.875,
|
|
"completions/min_length": 110.0,
|
|
"completions/min_terminated_length": 110.0,
|
|
"epoch": 0.872,
|
|
"grad_norm": 0.03889426216483116,
|
|
"learning_rate": 1.4234875444839858e-07,
|
|
"loss": -0.0055,
|
|
"num_tokens": 266730838.0,
|
|
"reward": 1.3409353494644165,
|
|
"reward_std": 0.18033601343631744,
|
|
"rewards/accuracy_reward_long_step": 0.46484375,
|
|
"rewards/final_brier_reward_long_step": 0.7062370777130127,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7981289625167847,
|
|
"step": 545
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 483.0,
|
|
"completions/max_terminated_length": 483.0,
|
|
"completions/mean_length": 254.52734375,
|
|
"completions/mean_terminated_length": 254.52734375,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.8736,
|
|
"grad_norm": 0.04387963190674782,
|
|
"learning_rate": 1.405693950177936e-07,
|
|
"loss": 0.0065,
|
|
"num_tokens": 267218405.0,
|
|
"reward": 1.3384535312652588,
|
|
"reward_std": 0.112638920545578,
|
|
"rewards/accuracy_reward_long_step": 0.44921875,
|
|
"rewards/final_brier_reward_long_step": 0.725222647190094,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8317165374755859,
|
|
"step": 546
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 579.0,
|
|
"completions/max_terminated_length": 579.0,
|
|
"completions/mean_length": 239.703125,
|
|
"completions/mean_terminated_length": 239.703125,
|
|
"completions/min_length": 126.0,
|
|
"completions/min_terminated_length": 126.0,
|
|
"epoch": 0.8752,
|
|
"grad_norm": 0.03998541459441185,
|
|
"learning_rate": 1.387900355871886e-07,
|
|
"loss": -0.0061,
|
|
"num_tokens": 267706177.0,
|
|
"reward": 1.5569477081298828,
|
|
"reward_std": 0.1203605979681015,
|
|
"rewards/accuracy_reward_long_step": 0.6484375,
|
|
"rewards/final_brier_reward_long_step": 0.8150613307952881,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8189792633056641,
|
|
"step": 547
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 505.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 247.90625,
|
|
"completions/mean_terminated_length": 247.90625,
|
|
"completions/min_length": 137.0,
|
|
"completions/min_terminated_length": 137.0,
|
|
"epoch": 0.8768,
|
|
"grad_norm": 0.040950994938611984,
|
|
"learning_rate": 1.3701067615658362e-07,
|
|
"loss": -0.0104,
|
|
"num_tokens": 268198305.0,
|
|
"reward": 1.3657258749008179,
|
|
"reward_std": 0.1606340855360031,
|
|
"rewards/accuracy_reward_long_step": 0.44140625,
|
|
"rewards/final_brier_reward_long_step": 0.8146769404411316,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8826013803482056,
|
|
"step": 548
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 416.0,
|
|
"completions/max_terminated_length": 416.0,
|
|
"completions/mean_length": 231.60546875,
|
|
"completions/mean_terminated_length": 231.60546875,
|
|
"completions/min_length": 135.0,
|
|
"completions/min_terminated_length": 135.0,
|
|
"epoch": 0.8784,
|
|
"grad_norm": 0.050138987600803375,
|
|
"learning_rate": 1.3523131672597866e-07,
|
|
"loss": -0.0009,
|
|
"num_tokens": 268659020.0,
|
|
"reward": 1.3868610858917236,
|
|
"reward_std": 0.1097867488861084,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.715602695941925,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8474666476249695,
|
|
"step": 549
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 551.0,
|
|
"completions/max_terminated_length": 551.0,
|
|
"completions/mean_length": 224.76171875,
|
|
"completions/mean_terminated_length": 224.76171875,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.88,
|
|
"grad_norm": 0.058884453028440475,
|
|
"learning_rate": 1.3345195729537365e-07,
|
|
"loss": 0.0061,
|
|
"num_tokens": 269130471.0,
|
|
"reward": 1.539604902267456,
|
|
"reward_std": 0.15575310587882996,
|
|
"rewards/accuracy_reward_long_step": 0.61328125,
|
|
"rewards/final_brier_reward_long_step": 0.8004753589630127,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.904819130897522,
|
|
"step": 550
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 574.0,
|
|
"completions/max_terminated_length": 574.0,
|
|
"completions/mean_length": 232.984375,
|
|
"completions/mean_terminated_length": 232.984375,
|
|
"completions/min_length": 135.0,
|
|
"completions/min_terminated_length": 135.0,
|
|
"epoch": 0.8816,
|
|
"grad_norm": 0.04245281219482422,
|
|
"learning_rate": 1.3167259786476866e-07,
|
|
"loss": -0.0042,
|
|
"num_tokens": 269601147.0,
|
|
"reward": 1.1472269296646118,
|
|
"reward_std": 0.12594836950302124,
|
|
"rewards/accuracy_reward_long_step": 0.28125,
|
|
"rewards/final_brier_reward_long_step": 0.6631394624710083,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.800768256187439,
|
|
"step": 551
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 400.0,
|
|
"completions/max_terminated_length": 400.0,
|
|
"completions/mean_length": 230.94140625,
|
|
"completions/mean_terminated_length": 230.94140625,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 0.8832,
|
|
"grad_norm": 0.0368904173374176,
|
|
"learning_rate": 1.298932384341637e-07,
|
|
"loss": 0.0065,
|
|
"num_tokens": 270091988.0,
|
|
"reward": 1.429011344909668,
|
|
"reward_std": 0.12329679727554321,
|
|
"rewards/accuracy_reward_long_step": 0.546875,
|
|
"rewards/final_brier_reward_long_step": 0.7112011313438416,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8173440098762512,
|
|
"step": 552
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 435.0,
|
|
"completions/max_terminated_length": 435.0,
|
|
"completions/mean_length": 228.328125,
|
|
"completions/mean_terminated_length": 228.328125,
|
|
"completions/min_length": 137.0,
|
|
"completions/min_terminated_length": 137.0,
|
|
"epoch": 0.8848,
|
|
"grad_norm": 0.03731711953878403,
|
|
"learning_rate": 1.2811387900355872e-07,
|
|
"loss": 0.0103,
|
|
"num_tokens": 270583416.0,
|
|
"reward": 1.5979522466659546,
|
|
"reward_std": 0.10820707678794861,
|
|
"rewards/accuracy_reward_long_step": 0.68359375,
|
|
"rewards/final_brier_reward_long_step": 0.8128556609153748,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8445781469345093,
|
|
"step": 553
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 521.0,
|
|
"completions/max_terminated_length": 521.0,
|
|
"completions/mean_length": 237.7421875,
|
|
"completions/mean_terminated_length": 237.7421875,
|
|
"completions/min_length": 122.0,
|
|
"completions/min_terminated_length": 122.0,
|
|
"epoch": 0.8864,
|
|
"grad_norm": 0.03969436511397362,
|
|
"learning_rate": 1.2633451957295373e-07,
|
|
"loss": -0.0071,
|
|
"num_tokens": 271076430.0,
|
|
"reward": 1.3755825757980347,
|
|
"reward_std": 0.11075370013713837,
|
|
"rewards/accuracy_reward_long_step": 0.48046875,
|
|
"rewards/final_brier_reward_long_step": 0.7792448997497559,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8012101054191589,
|
|
"step": 554
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 494.0,
|
|
"completions/max_terminated_length": 494.0,
|
|
"completions/mean_length": 246.546875,
|
|
"completions/mean_terminated_length": 246.546875,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 0.888,
|
|
"grad_norm": 0.03638119623064995,
|
|
"learning_rate": 1.2455516014234875e-07,
|
|
"loss": 0.0094,
|
|
"num_tokens": 271573154.0,
|
|
"reward": 1.4438494443893433,
|
|
"reward_std": 0.14020705223083496,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.7870507836341858,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8320969343185425,
|
|
"step": 555
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 503.0,
|
|
"completions/max_terminated_length": 503.0,
|
|
"completions/mean_length": 234.90234375,
|
|
"completions/mean_terminated_length": 234.90234375,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.8896,
|
|
"grad_norm": 0.045414846390485764,
|
|
"learning_rate": 1.2277580071174376e-07,
|
|
"loss": 0.0053,
|
|
"num_tokens": 272051201.0,
|
|
"reward": 1.4294031858444214,
|
|
"reward_std": 0.07926599681377411,
|
|
"rewards/accuracy_reward_long_step": 0.52734375,
|
|
"rewards/final_brier_reward_long_step": 0.8273754119873047,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7808624505996704,
|
|
"step": 556
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 735.0,
|
|
"completions/max_terminated_length": 735.0,
|
|
"completions/mean_length": 248.7890625,
|
|
"completions/mean_terminated_length": 248.7890625,
|
|
"completions/min_length": 146.0,
|
|
"completions/min_terminated_length": 146.0,
|
|
"epoch": 0.8912,
|
|
"grad_norm": 0.042798083275556564,
|
|
"learning_rate": 1.2099644128113878e-07,
|
|
"loss": -0.0078,
|
|
"num_tokens": 272544787.0,
|
|
"reward": 1.5465130805969238,
|
|
"reward_std": 0.09747041761875153,
|
|
"rewards/accuracy_reward_long_step": 0.65234375,
|
|
"rewards/final_brier_reward_long_step": 0.7800741791725159,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7966029644012451,
|
|
"step": 557
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 536.0,
|
|
"completions/max_terminated_length": 536.0,
|
|
"completions/mean_length": 231.265625,
|
|
"completions/mean_terminated_length": 231.265625,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.8928,
|
|
"grad_norm": 0.03589711710810661,
|
|
"learning_rate": 1.1921708185053382e-07,
|
|
"loss": -0.0023,
|
|
"num_tokens": 273043191.0,
|
|
"reward": 1.419055461883545,
|
|
"reward_std": 0.12412445992231369,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.8063081502914429,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8699132204055786,
|
|
"step": 558
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 465.0,
|
|
"completions/max_terminated_length": 465.0,
|
|
"completions/mean_length": 234.390625,
|
|
"completions/mean_terminated_length": 234.390625,
|
|
"completions/min_length": 125.0,
|
|
"completions/min_terminated_length": 125.0,
|
|
"epoch": 0.8944,
|
|
"grad_norm": 0.0373535081744194,
|
|
"learning_rate": 1.1743772241992882e-07,
|
|
"loss": -0.0117,
|
|
"num_tokens": 273519939.0,
|
|
"reward": 1.3120028972625732,
|
|
"reward_std": 0.15166430175304413,
|
|
"rewards/accuracy_reward_long_step": 0.44140625,
|
|
"rewards/final_brier_reward_long_step": 0.6639257669448853,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8184609413146973,
|
|
"step": 559
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 421.0,
|
|
"completions/max_terminated_length": 421.0,
|
|
"completions/mean_length": 229.53515625,
|
|
"completions/mean_terminated_length": 229.53515625,
|
|
"completions/min_length": 105.0,
|
|
"completions/min_terminated_length": 105.0,
|
|
"epoch": 0.896,
|
|
"grad_norm": 0.043007586151361465,
|
|
"learning_rate": 1.1565836298932385e-07,
|
|
"loss": 0.0086,
|
|
"num_tokens": 274002692.0,
|
|
"reward": 1.4279296398162842,
|
|
"reward_std": 0.13204392790794373,
|
|
"rewards/accuracy_reward_long_step": 0.546875,
|
|
"rewards/final_brier_reward_long_step": 0.721155047416687,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8030637502670288,
|
|
"step": 560
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 500.0,
|
|
"completions/max_terminated_length": 500.0,
|
|
"completions/mean_length": 241.80078125,
|
|
"completions/mean_terminated_length": 241.80078125,
|
|
"completions/min_length": 111.0,
|
|
"completions/min_terminated_length": 111.0,
|
|
"epoch": 0.8976,
|
|
"grad_norm": 0.04521722346544266,
|
|
"learning_rate": 1.1387900355871885e-07,
|
|
"loss": 0.0018,
|
|
"num_tokens": 274482809.0,
|
|
"reward": 1.4112411737442017,
|
|
"reward_std": 0.08869924396276474,
|
|
"rewards/accuracy_reward_long_step": 0.50390625,
|
|
"rewards/final_brier_reward_long_step": 0.8020182847976685,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8273211717605591,
|
|
"step": 561
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 436.0,
|
|
"completions/max_terminated_length": 436.0,
|
|
"completions/mean_length": 237.63671875,
|
|
"completions/mean_terminated_length": 237.63671875,
|
|
"completions/min_length": 118.0,
|
|
"completions/min_terminated_length": 118.0,
|
|
"epoch": 0.8992,
|
|
"grad_norm": 0.04603974521160126,
|
|
"learning_rate": 1.1209964412811388e-07,
|
|
"loss": 0.011,
|
|
"num_tokens": 274960236.0,
|
|
"reward": 1.3271350860595703,
|
|
"reward_std": 0.0897248238325119,
|
|
"rewards/accuracy_reward_long_step": 0.43359375,
|
|
"rewards/final_brier_reward_long_step": 0.7432428598403931,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8309223055839539,
|
|
"step": 562
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 481.0,
|
|
"completions/max_terminated_length": 481.0,
|
|
"completions/mean_length": 244.44140625,
|
|
"completions/mean_terminated_length": 244.44140625,
|
|
"completions/min_length": 126.0,
|
|
"completions/min_terminated_length": 126.0,
|
|
"epoch": 0.9008,
|
|
"grad_norm": 0.04642568156123161,
|
|
"learning_rate": 1.103202846975089e-07,
|
|
"loss": -0.0195,
|
|
"num_tokens": 275447389.0,
|
|
"reward": 1.245979905128479,
|
|
"reward_std": 0.11592195183038712,
|
|
"rewards/accuracy_reward_long_step": 0.3828125,
|
|
"rewards/final_brier_reward_long_step": 0.720180869102478,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7324886322021484,
|
|
"step": 563
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 421.0,
|
|
"completions/max_terminated_length": 421.0,
|
|
"completions/mean_length": 218.67578125,
|
|
"completions/mean_terminated_length": 218.67578125,
|
|
"completions/min_length": 123.0,
|
|
"completions/min_terminated_length": 123.0,
|
|
"epoch": 0.9024,
|
|
"grad_norm": 0.043522898107767105,
|
|
"learning_rate": 1.085409252669039e-07,
|
|
"loss": 0.0223,
|
|
"num_tokens": 275928954.0,
|
|
"reward": 1.5771305561065674,
|
|
"reward_std": 0.11938679218292236,
|
|
"rewards/accuracy_reward_long_step": 0.671875,
|
|
"rewards/final_brier_reward_long_step": 0.7716304063796997,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8493919372558594,
|
|
"step": 564
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 623.0,
|
|
"completions/max_terminated_length": 623.0,
|
|
"completions/mean_length": 227.44921875,
|
|
"completions/mean_terminated_length": 227.44921875,
|
|
"completions/min_length": 129.0,
|
|
"completions/min_terminated_length": 129.0,
|
|
"epoch": 0.904,
|
|
"grad_norm": 0.037754617631435394,
|
|
"learning_rate": 1.0676156583629893e-07,
|
|
"loss": 0.0003,
|
|
"num_tokens": 276409509.0,
|
|
"reward": 1.6633574962615967,
|
|
"reward_std": 0.13154950737953186,
|
|
"rewards/accuracy_reward_long_step": 0.75390625,
|
|
"rewards/final_brier_reward_long_step": 0.8070245981216431,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8385924696922302,
|
|
"step": 565
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 627.0,
|
|
"completions/max_terminated_length": 627.0,
|
|
"completions/mean_length": 245.3046875,
|
|
"completions/mean_terminated_length": 245.3046875,
|
|
"completions/min_length": 126.0,
|
|
"completions/min_terminated_length": 126.0,
|
|
"epoch": 0.9056,
|
|
"grad_norm": 0.04103442654013634,
|
|
"learning_rate": 1.0498220640569395e-07,
|
|
"loss": 0.0011,
|
|
"num_tokens": 276899091.0,
|
|
"reward": 1.378248691558838,
|
|
"reward_std": 0.1689617931842804,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.7711043357849121,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.788765549659729,
|
|
"step": 566
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 593.0,
|
|
"completions/max_terminated_length": 593.0,
|
|
"completions/mean_length": 254.94140625,
|
|
"completions/mean_terminated_length": 254.94140625,
|
|
"completions/min_length": 92.0,
|
|
"completions/min_terminated_length": 92.0,
|
|
"epoch": 0.9072,
|
|
"grad_norm": 0.03693830594420433,
|
|
"learning_rate": 1.0320284697508896e-07,
|
|
"loss": -0.0089,
|
|
"num_tokens": 277401420.0,
|
|
"reward": 1.2199474573135376,
|
|
"reward_std": 0.18783383071422577,
|
|
"rewards/accuracy_reward_long_step": 0.3515625,
|
|
"rewards/final_brier_reward_long_step": 0.6998242139816284,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7737154960632324,
|
|
"step": 567
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 618.0,
|
|
"completions/max_terminated_length": 618.0,
|
|
"completions/mean_length": 240.95703125,
|
|
"completions/mean_terminated_length": 240.95703125,
|
|
"completions/min_length": 101.0,
|
|
"completions/min_terminated_length": 101.0,
|
|
"epoch": 0.9088,
|
|
"grad_norm": 0.04158253222703934,
|
|
"learning_rate": 1.0142348754448399e-07,
|
|
"loss": 0.0082,
|
|
"num_tokens": 277887441.0,
|
|
"reward": 1.4272714853286743,
|
|
"reward_std": 0.17625702917575836,
|
|
"rewards/accuracy_reward_long_step": 0.5390625,
|
|
"rewards/final_brier_reward_long_step": 0.776642918586731,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7761929035186768,
|
|
"step": 568
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 536.0,
|
|
"completions/max_terminated_length": 536.0,
|
|
"completions/mean_length": 231.8125,
|
|
"completions/mean_terminated_length": 231.8125,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.9104,
|
|
"grad_norm": 0.0454368069767952,
|
|
"learning_rate": 9.9644128113879e-08,
|
|
"loss": 0.0031,
|
|
"num_tokens": 278370153.0,
|
|
"reward": 1.3535585403442383,
|
|
"reward_std": 0.09445783495903015,
|
|
"rewards/accuracy_reward_long_step": 0.43359375,
|
|
"rewards/final_brier_reward_long_step": 0.843848466873169,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8360108137130737,
|
|
"step": 569
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 452.0,
|
|
"completions/max_terminated_length": 452.0,
|
|
"completions/mean_length": 228.94921875,
|
|
"completions/mean_terminated_length": 228.94921875,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 0.912,
|
|
"grad_norm": 0.03921419754624367,
|
|
"learning_rate": 9.786476868327402e-08,
|
|
"loss": 0.0004,
|
|
"num_tokens": 278864260.0,
|
|
"reward": 1.426851749420166,
|
|
"reward_std": 0.18421000242233276,
|
|
"rewards/accuracy_reward_long_step": 0.54296875,
|
|
"rewards/final_brier_reward_long_step": 0.7497539520263672,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7857784032821655,
|
|
"step": 570
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 418.0,
|
|
"completions/max_terminated_length": 418.0,
|
|
"completions/mean_length": 236.28125,
|
|
"completions/mean_terminated_length": 236.28125,
|
|
"completions/min_length": 137.0,
|
|
"completions/min_terminated_length": 137.0,
|
|
"epoch": 0.9136,
|
|
"grad_norm": 0.03796105086803436,
|
|
"learning_rate": 9.608540925266903e-08,
|
|
"loss": -0.0075,
|
|
"num_tokens": 279345732.0,
|
|
"reward": 1.2733724117279053,
|
|
"reward_std": 0.17839360237121582,
|
|
"rewards/accuracy_reward_long_step": 0.41015625,
|
|
"rewards/final_brier_reward_long_step": 0.6511929631233215,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8016713857650757,
|
|
"step": 571
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 525.0,
|
|
"completions/max_terminated_length": 525.0,
|
|
"completions/mean_length": 243.11328125,
|
|
"completions/mean_terminated_length": 243.11328125,
|
|
"completions/min_length": 116.0,
|
|
"completions/min_terminated_length": 116.0,
|
|
"epoch": 0.9152,
|
|
"grad_norm": 0.03715438395738602,
|
|
"learning_rate": 9.430604982206406e-08,
|
|
"loss": 0.0104,
|
|
"num_tokens": 279841417.0,
|
|
"reward": 1.3816213607788086,
|
|
"reward_std": 0.12980535626411438,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.7918597459793091,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7815006971359253,
|
|
"step": 572
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 532.0,
|
|
"completions/max_terminated_length": 532.0,
|
|
"completions/mean_length": 242.07421875,
|
|
"completions/mean_terminated_length": 242.07421875,
|
|
"completions/min_length": 115.0,
|
|
"completions/min_terminated_length": 115.0,
|
|
"epoch": 0.9168,
|
|
"grad_norm": 0.04261266440153122,
|
|
"learning_rate": 9.252669039145906e-08,
|
|
"loss": -0.0052,
|
|
"num_tokens": 280341788.0,
|
|
"reward": 1.4158134460449219,
|
|
"reward_std": 0.09869112074375153,
|
|
"rewards/accuracy_reward_long_step": 0.50390625,
|
|
"rewards/final_brier_reward_long_step": 0.8082069754600525,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8394216299057007,
|
|
"step": 573
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 439.0,
|
|
"completions/max_terminated_length": 439.0,
|
|
"completions/mean_length": 226.296875,
|
|
"completions/mean_terminated_length": 226.296875,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.9184,
|
|
"grad_norm": 0.07661443948745728,
|
|
"learning_rate": 9.074733096085409e-08,
|
|
"loss": -0.0077,
|
|
"num_tokens": 280823224.0,
|
|
"reward": 1.532405138015747,
|
|
"reward_std": 0.08156967163085938,
|
|
"rewards/accuracy_reward_long_step": 0.64453125,
|
|
"rewards/final_brier_reward_long_step": 0.7845557928085327,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.766939640045166,
|
|
"step": 574
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 708.0,
|
|
"completions/max_terminated_length": 708.0,
|
|
"completions/mean_length": 240.23046875,
|
|
"completions/mean_terminated_length": 240.23046875,
|
|
"completions/min_length": 138.0,
|
|
"completions/min_terminated_length": 138.0,
|
|
"epoch": 0.92,
|
|
"grad_norm": 0.05118957906961441,
|
|
"learning_rate": 8.896797153024912e-08,
|
|
"loss": -0.01,
|
|
"num_tokens": 281322563.0,
|
|
"reward": 1.353409767150879,
|
|
"reward_std": 0.12275659292936325,
|
|
"rewards/accuracy_reward_long_step": 0.44921875,
|
|
"rewards/final_brier_reward_long_step": 0.7173187732696533,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8994452357292175,
|
|
"step": 575
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 577.0,
|
|
"completions/max_terminated_length": 577.0,
|
|
"completions/mean_length": 236.58984375,
|
|
"completions/mean_terminated_length": 236.58984375,
|
|
"completions/min_length": 151.0,
|
|
"completions/min_terminated_length": 151.0,
|
|
"epoch": 0.9216,
|
|
"grad_norm": 0.044467244297266006,
|
|
"learning_rate": 8.718861209964412e-08,
|
|
"loss": 0.0093,
|
|
"num_tokens": 281803666.0,
|
|
"reward": 1.394946575164795,
|
|
"reward_std": 0.09921001642942429,
|
|
"rewards/accuracy_reward_long_step": 0.515625,
|
|
"rewards/final_brier_reward_long_step": 0.771274209022522,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7460117340087891,
|
|
"step": 576
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 374.0,
|
|
"completions/max_terminated_length": 374.0,
|
|
"completions/mean_length": 219.89453125,
|
|
"completions/mean_terminated_length": 219.89453125,
|
|
"completions/min_length": 135.0,
|
|
"completions/min_terminated_length": 135.0,
|
|
"epoch": 0.9232,
|
|
"grad_norm": 0.04329831525683403,
|
|
"learning_rate": 8.540925266903915e-08,
|
|
"loss": -0.0061,
|
|
"num_tokens": 282247735.0,
|
|
"reward": 1.3368597030639648,
|
|
"reward_std": 0.12454381585121155,
|
|
"rewards/accuracy_reward_long_step": 0.46484375,
|
|
"rewards/final_brier_reward_long_step": 0.7124074101448059,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7756561636924744,
|
|
"step": 577
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 447.0,
|
|
"completions/max_terminated_length": 447.0,
|
|
"completions/mean_length": 232.28515625,
|
|
"completions/mean_terminated_length": 232.28515625,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.9248,
|
|
"grad_norm": 0.043502867221832275,
|
|
"learning_rate": 8.362989323843416e-08,
|
|
"loss": -0.0013,
|
|
"num_tokens": 282739784.0,
|
|
"reward": 1.2974281311035156,
|
|
"reward_std": 0.17717978358268738,
|
|
"rewards/accuracy_reward_long_step": 0.4453125,
|
|
"rewards/final_brier_reward_long_step": 0.6866112947463989,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7218515872955322,
|
|
"step": 578
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 520.0,
|
|
"completions/max_terminated_length": 520.0,
|
|
"completions/mean_length": 229.37890625,
|
|
"completions/mean_terminated_length": 229.37890625,
|
|
"completions/min_length": 118.0,
|
|
"completions/min_terminated_length": 118.0,
|
|
"epoch": 0.9264,
|
|
"grad_norm": 0.038043197244405746,
|
|
"learning_rate": 8.185053380782917e-08,
|
|
"loss": 0.0001,
|
|
"num_tokens": 283228401.0,
|
|
"reward": 1.4870903491973877,
|
|
"reward_std": 0.11480262130498886,
|
|
"rewards/accuracy_reward_long_step": 0.58203125,
|
|
"rewards/final_brier_reward_long_step": 0.7964640259742737,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.823772668838501,
|
|
"step": 579
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 448.0,
|
|
"completions/max_terminated_length": 448.0,
|
|
"completions/mean_length": 231.578125,
|
|
"completions/mean_terminated_length": 231.578125,
|
|
"completions/min_length": 92.0,
|
|
"completions/min_terminated_length": 92.0,
|
|
"epoch": 0.928,
|
|
"grad_norm": 0.04364067688584328,
|
|
"learning_rate": 8.00711743772242e-08,
|
|
"loss": 0.0038,
|
|
"num_tokens": 283710941.0,
|
|
"reward": 1.3739776611328125,
|
|
"reward_std": 0.0760713666677475,
|
|
"rewards/accuracy_reward_long_step": 0.453125,
|
|
"rewards/final_brier_reward_long_step": 0.8134101629257202,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8700001239776611,
|
|
"step": 580
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 444.0,
|
|
"completions/max_terminated_length": 444.0,
|
|
"completions/mean_length": 241.80078125,
|
|
"completions/mean_terminated_length": 241.80078125,
|
|
"completions/min_length": 100.0,
|
|
"completions/min_terminated_length": 100.0,
|
|
"epoch": 0.9296,
|
|
"grad_norm": 0.04623554274439812,
|
|
"learning_rate": 7.829181494661922e-08,
|
|
"loss": -0.0163,
|
|
"num_tokens": 284203962.0,
|
|
"reward": 1.3043211698532104,
|
|
"reward_std": 0.10367835313081741,
|
|
"rewards/accuracy_reward_long_step": 0.42578125,
|
|
"rewards/final_brier_reward_long_step": 0.7652456760406494,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7489140033721924,
|
|
"step": 581
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 665.0,
|
|
"completions/max_terminated_length": 665.0,
|
|
"completions/mean_length": 248.40234375,
|
|
"completions/mean_terminated_length": 248.40234375,
|
|
"completions/min_length": 142.0,
|
|
"completions/min_terminated_length": 142.0,
|
|
"epoch": 0.9312,
|
|
"grad_norm": 0.046936266124248505,
|
|
"learning_rate": 7.651245551601423e-08,
|
|
"loss": 0.0062,
|
|
"num_tokens": 284689161.0,
|
|
"reward": 1.2649556398391724,
|
|
"reward_std": 0.13359886407852173,
|
|
"rewards/accuracy_reward_long_step": 0.41015625,
|
|
"rewards/final_brier_reward_long_step": 0.6901007294654846,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7290970087051392,
|
|
"step": 582
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 409.0,
|
|
"completions/max_terminated_length": 409.0,
|
|
"completions/mean_length": 224.82421875,
|
|
"completions/mean_terminated_length": 224.82421875,
|
|
"completions/min_length": 118.0,
|
|
"completions/min_terminated_length": 118.0,
|
|
"epoch": 0.9328,
|
|
"grad_norm": 0.048148345202207565,
|
|
"learning_rate": 7.473309608540925e-08,
|
|
"loss": 0.0091,
|
|
"num_tokens": 285176220.0,
|
|
"reward": 1.545983076095581,
|
|
"reward_std": 0.10164255648851395,
|
|
"rewards/accuracy_reward_long_step": 0.64453125,
|
|
"rewards/final_brier_reward_long_step": 0.7637656331062317,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8420416116714478,
|
|
"step": 583
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 444.0,
|
|
"completions/max_terminated_length": 444.0,
|
|
"completions/mean_length": 234.75390625,
|
|
"completions/mean_terminated_length": 234.75390625,
|
|
"completions/min_length": 143.0,
|
|
"completions/min_terminated_length": 143.0,
|
|
"epoch": 0.9344,
|
|
"grad_norm": 0.0379473976790905,
|
|
"learning_rate": 7.295373665480427e-08,
|
|
"loss": -0.0063,
|
|
"num_tokens": 285682757.0,
|
|
"reward": 1.407658338546753,
|
|
"reward_std": 0.1226111352443695,
|
|
"rewards/accuracy_reward_long_step": 0.51953125,
|
|
"rewards/final_brier_reward_long_step": 0.7624057531356812,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7901023626327515,
|
|
"step": 584
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 509.0,
|
|
"completions/max_terminated_length": 509.0,
|
|
"completions/mean_length": 235.9140625,
|
|
"completions/mean_terminated_length": 235.9140625,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.936,
|
|
"grad_norm": 0.04213576763868332,
|
|
"learning_rate": 7.117437722419929e-08,
|
|
"loss": 0.0006,
|
|
"num_tokens": 286148327.0,
|
|
"reward": 1.3678431510925293,
|
|
"reward_std": 0.13030381500720978,
|
|
"rewards/accuracy_reward_long_step": 0.484375,
|
|
"rewards/final_brier_reward_long_step": 0.7432855367660522,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7905872464179993,
|
|
"step": 585
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 571.0,
|
|
"completions/max_terminated_length": 571.0,
|
|
"completions/mean_length": 239.28125,
|
|
"completions/mean_terminated_length": 239.28125,
|
|
"completions/min_length": 135.0,
|
|
"completions/min_terminated_length": 135.0,
|
|
"epoch": 0.9376,
|
|
"grad_norm": 0.039584312587976456,
|
|
"learning_rate": 6.93950177935943e-08,
|
|
"loss": -0.0023,
|
|
"num_tokens": 286619447.0,
|
|
"reward": 1.4525396823883057,
|
|
"reward_std": 0.1256856620311737,
|
|
"rewards/accuracy_reward_long_step": 0.5625,
|
|
"rewards/final_brier_reward_long_step": 0.7838070392608643,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.776351809501648,
|
|
"step": 586
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 472.0,
|
|
"completions/max_terminated_length": 472.0,
|
|
"completions/mean_length": 245.26953125,
|
|
"completions/mean_terminated_length": 245.26953125,
|
|
"completions/min_length": 86.0,
|
|
"completions/min_terminated_length": 86.0,
|
|
"epoch": 0.9392,
|
|
"grad_norm": 0.039651911705732346,
|
|
"learning_rate": 6.761565836298933e-08,
|
|
"loss": 0.0029,
|
|
"num_tokens": 287116388.0,
|
|
"reward": 1.378925085067749,
|
|
"reward_std": 0.12979546189308167,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.7602671980857849,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7710578441619873,
|
|
"step": 587
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 519.0,
|
|
"completions/max_terminated_length": 519.0,
|
|
"completions/mean_length": 232.0,
|
|
"completions/mean_terminated_length": 232.0,
|
|
"completions/min_length": 114.0,
|
|
"completions/min_terminated_length": 114.0,
|
|
"epoch": 0.9408,
|
|
"grad_norm": 0.05170515552163124,
|
|
"learning_rate": 6.583629893238433e-08,
|
|
"loss": -0.0055,
|
|
"num_tokens": 287604604.0,
|
|
"reward": 1.4521524906158447,
|
|
"reward_std": 0.19186797738075256,
|
|
"rewards/accuracy_reward_long_step": 0.55859375,
|
|
"rewards/final_brier_reward_long_step": 0.796981692314148,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.777253270149231,
|
|
"step": 588
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 446.0,
|
|
"completions/max_terminated_length": 446.0,
|
|
"completions/mean_length": 230.9609375,
|
|
"completions/mean_terminated_length": 230.9609375,
|
|
"completions/min_length": 128.0,
|
|
"completions/min_terminated_length": 128.0,
|
|
"epoch": 0.9424,
|
|
"grad_norm": 0.043422844260931015,
|
|
"learning_rate": 6.405693950177936e-08,
|
|
"loss": -0.0021,
|
|
"num_tokens": 288080906.0,
|
|
"reward": 1.5288910865783691,
|
|
"reward_std": 0.10271154344081879,
|
|
"rewards/accuracy_reward_long_step": 0.61328125,
|
|
"rewards/final_brier_reward_long_step": 0.816925048828125,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8455142974853516,
|
|
"step": 589
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 621.0,
|
|
"completions/max_terminated_length": 621.0,
|
|
"completions/mean_length": 236.06640625,
|
|
"completions/mean_terminated_length": 236.06640625,
|
|
"completions/min_length": 135.0,
|
|
"completions/min_terminated_length": 135.0,
|
|
"epoch": 0.944,
|
|
"grad_norm": 0.036451030522584915,
|
|
"learning_rate": 6.227758007117437e-08,
|
|
"loss": 0.0043,
|
|
"num_tokens": 288552851.0,
|
|
"reward": 1.5121817588806152,
|
|
"reward_std": 0.1267908215522766,
|
|
"rewards/accuracy_reward_long_step": 0.62109375,
|
|
"rewards/final_brier_reward_long_step": 0.7251984477043152,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8391537070274353,
|
|
"step": 590
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 439.0,
|
|
"completions/max_terminated_length": 439.0,
|
|
"completions/mean_length": 232.703125,
|
|
"completions/mean_terminated_length": 232.703125,
|
|
"completions/min_length": 134.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.9456,
|
|
"grad_norm": 0.04694944620132446,
|
|
"learning_rate": 6.049822064056939e-08,
|
|
"loss": -0.0057,
|
|
"num_tokens": 289027695.0,
|
|
"reward": 1.4363982677459717,
|
|
"reward_std": 0.15798181295394897,
|
|
"rewards/accuracy_reward_long_step": 0.55078125,
|
|
"rewards/final_brier_reward_long_step": 0.7334683537483215,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.80899977684021,
|
|
"step": 591
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 516.0,
|
|
"completions/max_terminated_length": 516.0,
|
|
"completions/mean_length": 244.984375,
|
|
"completions/mean_terminated_length": 244.984375,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.9472,
|
|
"grad_norm": 0.0597044937312603,
|
|
"learning_rate": 5.871886120996441e-08,
|
|
"loss": -0.016,
|
|
"num_tokens": 289507067.0,
|
|
"reward": 1.3889837265014648,
|
|
"reward_std": 0.08240145444869995,
|
|
"rewards/accuracy_reward_long_step": 0.45703125,
|
|
"rewards/final_brier_reward_long_step": 0.8605644702911377,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8672455549240112,
|
|
"step": 592
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 554.0,
|
|
"completions/max_terminated_length": 554.0,
|
|
"completions/mean_length": 233.28515625,
|
|
"completions/mean_terminated_length": 233.28515625,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.9488,
|
|
"grad_norm": 0.036467116326093674,
|
|
"learning_rate": 5.6939501779359424e-08,
|
|
"loss": 0.0026,
|
|
"num_tokens": 289991372.0,
|
|
"reward": 1.28902006149292,
|
|
"reward_std": 0.11750981956720352,
|
|
"rewards/accuracy_reward_long_step": 0.4375,
|
|
"rewards/final_brier_reward_long_step": 0.6445460915565491,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7615340352058411,
|
|
"step": 593
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 563.0,
|
|
"completions/max_terminated_length": 563.0,
|
|
"completions/mean_length": 227.6171875,
|
|
"completions/mean_terminated_length": 227.6171875,
|
|
"completions/min_length": 123.0,
|
|
"completions/min_terminated_length": 123.0,
|
|
"epoch": 0.9504,
|
|
"grad_norm": 0.09449837356805801,
|
|
"learning_rate": 5.516014234875445e-08,
|
|
"loss": 0.0093,
|
|
"num_tokens": 290484434.0,
|
|
"reward": 1.574216604232788,
|
|
"reward_std": 0.11178240180015564,
|
|
"rewards/accuracy_reward_long_step": 0.69140625,
|
|
"rewards/final_brier_reward_long_step": 0.7768968343734741,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7543442845344543,
|
|
"step": 594
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 527.0,
|
|
"completions/max_terminated_length": 527.0,
|
|
"completions/mean_length": 244.6015625,
|
|
"completions/mean_terminated_length": 244.6015625,
|
|
"completions/min_length": 127.0,
|
|
"completions/min_terminated_length": 127.0,
|
|
"epoch": 0.952,
|
|
"grad_norm": 0.03990272060036659,
|
|
"learning_rate": 5.3380782918149466e-08,
|
|
"loss": 0.0085,
|
|
"num_tokens": 290989700.0,
|
|
"reward": 1.508046269416809,
|
|
"reward_std": 0.10411694645881653,
|
|
"rewards/accuracy_reward_long_step": 0.6015625,
|
|
"rewards/final_brier_reward_long_step": 0.8094656467437744,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.824282169342041,
|
|
"step": 595
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 498.0,
|
|
"completions/max_terminated_length": 498.0,
|
|
"completions/mean_length": 258.6171875,
|
|
"completions/mean_terminated_length": 258.6171875,
|
|
"completions/min_length": 144.0,
|
|
"completions/min_terminated_length": 144.0,
|
|
"epoch": 0.9536,
|
|
"grad_norm": 0.035313017666339874,
|
|
"learning_rate": 5.160142348754448e-08,
|
|
"loss": 0.0136,
|
|
"num_tokens": 291475994.0,
|
|
"reward": 1.4012870788574219,
|
|
"reward_std": 0.130996972322464,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.8204870820045471,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7846609950065613,
|
|
"step": 596
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 506.0,
|
|
"completions/max_terminated_length": 506.0,
|
|
"completions/mean_length": 244.3515625,
|
|
"completions/mean_terminated_length": 244.3515625,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.9552,
|
|
"grad_norm": 0.04008813574910164,
|
|
"learning_rate": 4.98220640569395e-08,
|
|
"loss": 0.002,
|
|
"num_tokens": 291959996.0,
|
|
"reward": 1.3849501609802246,
|
|
"reward_std": 0.12151844799518585,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.7529085874557495,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.6931424140930176,
|
|
"step": 597
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 492.0,
|
|
"completions/max_terminated_length": 492.0,
|
|
"completions/mean_length": 226.16015625,
|
|
"completions/mean_terminated_length": 226.16015625,
|
|
"completions/min_length": 109.0,
|
|
"completions/min_terminated_length": 109.0,
|
|
"epoch": 0.9568,
|
|
"grad_norm": 0.04041390120983124,
|
|
"learning_rate": 4.8042704626334516e-08,
|
|
"loss": -0.0106,
|
|
"num_tokens": 292435517.0,
|
|
"reward": 1.3907995223999023,
|
|
"reward_std": 0.08410888910293579,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.7949777245521545,
|
|
"rewards/format_reward_long_step": 0.99609375,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7760331034660339,
|
|
"step": 598
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 580.0,
|
|
"completions/max_terminated_length": 580.0,
|
|
"completions/mean_length": 244.42578125,
|
|
"completions/mean_terminated_length": 244.42578125,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.9584,
|
|
"grad_norm": 0.042032286524772644,
|
|
"learning_rate": 4.626334519572953e-08,
|
|
"loss": 0.0048,
|
|
"num_tokens": 292911202.0,
|
|
"reward": 1.2448697090148926,
|
|
"reward_std": 0.10349094122648239,
|
|
"rewards/accuracy_reward_long_step": 0.375,
|
|
"rewards/final_brier_reward_long_step": 0.7020269632339478,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7774521112442017,
|
|
"step": 599
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 530.0,
|
|
"completions/max_terminated_length": 530.0,
|
|
"completions/mean_length": 230.69921875,
|
|
"completions/mean_terminated_length": 230.69921875,
|
|
"completions/min_length": 134.0,
|
|
"completions/min_terminated_length": 134.0,
|
|
"epoch": 0.96,
|
|
"grad_norm": 0.04338167607784271,
|
|
"learning_rate": 4.448398576512456e-08,
|
|
"loss": 0.0117,
|
|
"num_tokens": 293390637.0,
|
|
"reward": 1.550196886062622,
|
|
"reward_std": 0.1076919287443161,
|
|
"rewards/accuracy_reward_long_step": 0.6484375,
|
|
"rewards/final_brier_reward_long_step": 0.8495925664901733,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7574446201324463,
|
|
"step": 600
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 555.0,
|
|
"completions/max_terminated_length": 555.0,
|
|
"completions/mean_length": 242.64453125,
|
|
"completions/mean_terminated_length": 242.64453125,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.9616,
|
|
"grad_norm": 0.044204358011484146,
|
|
"learning_rate": 4.270462633451957e-08,
|
|
"loss": 0.0193,
|
|
"num_tokens": 293870786.0,
|
|
"reward": 1.3222875595092773,
|
|
"reward_std": 0.09255368262529373,
|
|
"rewards/accuracy_reward_long_step": 0.3984375,
|
|
"rewards/final_brier_reward_long_step": 0.8258058428764343,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8695942163467407,
|
|
"step": 601
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 463.0,
|
|
"completions/max_terminated_length": 463.0,
|
|
"completions/mean_length": 238.5390625,
|
|
"completions/mean_terminated_length": 238.5390625,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.9632,
|
|
"grad_norm": 0.04069282487034798,
|
|
"learning_rate": 4.092526690391459e-08,
|
|
"loss": -0.0003,
|
|
"num_tokens": 294354396.0,
|
|
"reward": 1.4837543964385986,
|
|
"reward_std": 0.1288076937198639,
|
|
"rewards/accuracy_reward_long_step": 0.5703125,
|
|
"rewards/final_brier_reward_long_step": 0.8007269501686096,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8530406355857849,
|
|
"step": 602
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 557.0,
|
|
"completions/max_terminated_length": 557.0,
|
|
"completions/mean_length": 241.36328125,
|
|
"completions/mean_terminated_length": 241.36328125,
|
|
"completions/min_length": 108.0,
|
|
"completions/min_terminated_length": 108.0,
|
|
"epoch": 0.9648,
|
|
"grad_norm": 0.0698588415980339,
|
|
"learning_rate": 3.914590747330961e-08,
|
|
"loss": -0.0062,
|
|
"num_tokens": 294831593.0,
|
|
"reward": 1.3811883926391602,
|
|
"reward_std": 0.12175668030977249,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.9002180099487305,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7495359182357788,
|
|
"step": 603
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 455.0,
|
|
"completions/max_terminated_length": 455.0,
|
|
"completions/mean_length": 228.00390625,
|
|
"completions/mean_terminated_length": 228.00390625,
|
|
"completions/min_length": 139.0,
|
|
"completions/min_terminated_length": 139.0,
|
|
"epoch": 0.9664,
|
|
"grad_norm": 0.04091455414891243,
|
|
"learning_rate": 3.736654804270462e-08,
|
|
"loss": 0.0007,
|
|
"num_tokens": 295312322.0,
|
|
"reward": 1.3806979656219482,
|
|
"reward_std": 0.1109173595905304,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.746747612953186,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7760441303253174,
|
|
"step": 604
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 456.0,
|
|
"completions/max_terminated_length": 456.0,
|
|
"completions/mean_length": 241.90234375,
|
|
"completions/mean_terminated_length": 241.90234375,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.968,
|
|
"grad_norm": 0.054420799016952515,
|
|
"learning_rate": 3.5587188612099644e-08,
|
|
"loss": 0.0025,
|
|
"num_tokens": 295800753.0,
|
|
"reward": 1.385452389717102,
|
|
"reward_std": 0.18200629949569702,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.6535894870758057,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7944704294204712,
|
|
"step": 605
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 474.0,
|
|
"completions/max_terminated_length": 474.0,
|
|
"completions/mean_length": 237.26171875,
|
|
"completions/mean_terminated_length": 237.26171875,
|
|
"completions/min_length": 125.0,
|
|
"completions/min_terminated_length": 125.0,
|
|
"epoch": 0.9696,
|
|
"grad_norm": 0.04651153087615967,
|
|
"learning_rate": 3.3807829181494665e-08,
|
|
"loss": 0.0115,
|
|
"num_tokens": 296282412.0,
|
|
"reward": 1.462805151939392,
|
|
"reward_std": 0.12919017672538757,
|
|
"rewards/accuracy_reward_long_step": 0.6015625,
|
|
"rewards/final_brier_reward_long_step": 0.685794472694397,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7591761350631714,
|
|
"step": 606
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 517.0,
|
|
"completions/max_terminated_length": 517.0,
|
|
"completions/mean_length": 231.671875,
|
|
"completions/mean_terminated_length": 231.671875,
|
|
"completions/min_length": 123.0,
|
|
"completions/min_terminated_length": 123.0,
|
|
"epoch": 0.9712,
|
|
"grad_norm": 0.046628162264823914,
|
|
"learning_rate": 3.202846975088968e-08,
|
|
"loss": 0.0057,
|
|
"num_tokens": 296766752.0,
|
|
"reward": 1.5526325702667236,
|
|
"reward_std": 0.08376991003751755,
|
|
"rewards/accuracy_reward_long_step": 0.62890625,
|
|
"rewards/final_brier_reward_long_step": 0.8181566596031189,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8767487406730652,
|
|
"step": 607
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 499.0,
|
|
"completions/max_terminated_length": 499.0,
|
|
"completions/mean_length": 244.3359375,
|
|
"completions/mean_terminated_length": 244.3359375,
|
|
"completions/min_length": 117.0,
|
|
"completions/min_terminated_length": 117.0,
|
|
"epoch": 0.9728,
|
|
"grad_norm": 0.045422233641147614,
|
|
"learning_rate": 3.0249110320284694e-08,
|
|
"loss": 0.0073,
|
|
"num_tokens": 297252086.0,
|
|
"reward": 1.3045486211776733,
|
|
"reward_std": 0.10245119035243988,
|
|
"rewards/accuracy_reward_long_step": 0.4140625,
|
|
"rewards/final_brier_reward_long_step": 0.7651957273483276,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7967486381530762,
|
|
"step": 608
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 624.0,
|
|
"completions/max_terminated_length": 624.0,
|
|
"completions/mean_length": 243.4375,
|
|
"completions/mean_terminated_length": 243.4375,
|
|
"completions/min_length": 140.0,
|
|
"completions/min_terminated_length": 140.0,
|
|
"epoch": 0.9744,
|
|
"grad_norm": 0.040375716984272,
|
|
"learning_rate": 2.8469750889679712e-08,
|
|
"loss": 0.013,
|
|
"num_tokens": 297745582.0,
|
|
"reward": 1.4966931343078613,
|
|
"reward_std": 0.092777319252491,
|
|
"rewards/accuracy_reward_long_step": 0.57421875,
|
|
"rewards/final_brier_reward_long_step": 0.837005078792572,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8528923988342285,
|
|
"step": 609
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 625.0,
|
|
"completions/max_terminated_length": 625.0,
|
|
"completions/mean_length": 242.9765625,
|
|
"completions/mean_terminated_length": 242.9765625,
|
|
"completions/min_length": 137.0,
|
|
"completions/min_terminated_length": 137.0,
|
|
"epoch": 0.976,
|
|
"grad_norm": 0.043781962245702744,
|
|
"learning_rate": 2.6690391459074733e-08,
|
|
"loss": 0.006,
|
|
"num_tokens": 298209824.0,
|
|
"reward": 1.462713360786438,
|
|
"reward_std": 0.14046858251094818,
|
|
"rewards/accuracy_reward_long_step": 0.56640625,
|
|
"rewards/final_brier_reward_long_step": 0.7839847803115845,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8012436628341675,
|
|
"step": 610
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 439.0,
|
|
"completions/max_terminated_length": 439.0,
|
|
"completions/mean_length": 224.9609375,
|
|
"completions/mean_terminated_length": 224.9609375,
|
|
"completions/min_length": 122.0,
|
|
"completions/min_terminated_length": 122.0,
|
|
"epoch": 0.9776,
|
|
"grad_norm": 0.03858632594347,
|
|
"learning_rate": 2.491103202846975e-08,
|
|
"loss": 0.0032,
|
|
"num_tokens": 298700686.0,
|
|
"reward": 1.3872606754302979,
|
|
"reward_std": 0.07525929063558578,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.740240216255188,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8088023662567139,
|
|
"step": 611
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 446.0,
|
|
"completions/max_terminated_length": 446.0,
|
|
"completions/mean_length": 233.19140625,
|
|
"completions/mean_terminated_length": 233.19140625,
|
|
"completions/min_length": 100.0,
|
|
"completions/min_terminated_length": 100.0,
|
|
"epoch": 0.9792,
|
|
"grad_norm": 0.04854563623666763,
|
|
"learning_rate": 2.3131672597864765e-08,
|
|
"loss": -0.007,
|
|
"num_tokens": 299182631.0,
|
|
"reward": 1.2626285552978516,
|
|
"reward_std": 0.09427875280380249,
|
|
"rewards/accuracy_reward_long_step": 0.37890625,
|
|
"rewards/final_brier_reward_long_step": 0.7738581895828247,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7610312700271606,
|
|
"step": 612
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 438.0,
|
|
"completions/max_terminated_length": 438.0,
|
|
"completions/mean_length": 232.41796875,
|
|
"completions/mean_terminated_length": 232.41796875,
|
|
"completions/min_length": 92.0,
|
|
"completions/min_terminated_length": 92.0,
|
|
"epoch": 0.9808,
|
|
"grad_norm": 0.04861883446574211,
|
|
"learning_rate": 2.1352313167259786e-08,
|
|
"loss": -0.0034,
|
|
"num_tokens": 299679130.0,
|
|
"reward": 1.4753010272979736,
|
|
"reward_std": 0.07958254218101501,
|
|
"rewards/accuracy_reward_long_step": 0.5546875,
|
|
"rewards/final_brier_reward_long_step": 0.8655683994293213,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8168855905532837,
|
|
"step": 613
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 385.0,
|
|
"completions/max_terminated_length": 385.0,
|
|
"completions/mean_length": 222.76953125,
|
|
"completions/mean_terminated_length": 222.76953125,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 0.9824,
|
|
"grad_norm": 0.04186774417757988,
|
|
"learning_rate": 1.9572953736654804e-08,
|
|
"loss": -0.0007,
|
|
"num_tokens": 300167079.0,
|
|
"reward": 1.3874082565307617,
|
|
"reward_std": 0.0872558057308197,
|
|
"rewards/accuracy_reward_long_step": 0.5234375,
|
|
"rewards/final_brier_reward_long_step": 0.6611804366111755,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7947026491165161,
|
|
"step": 614
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 704.0,
|
|
"completions/max_terminated_length": 704.0,
|
|
"completions/mean_length": 254.33984375,
|
|
"completions/mean_terminated_length": 254.33984375,
|
|
"completions/min_length": 157.0,
|
|
"completions/min_terminated_length": 157.0,
|
|
"epoch": 0.984,
|
|
"grad_norm": 0.0385391041636467,
|
|
"learning_rate": 1.7793594306049822e-08,
|
|
"loss": 0.0161,
|
|
"num_tokens": 300662414.0,
|
|
"reward": 1.3372551202774048,
|
|
"reward_std": 0.15426021814346313,
|
|
"rewards/accuracy_reward_long_step": 0.4296875,
|
|
"rewards/final_brier_reward_long_step": 0.7895093560218811,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8407611846923828,
|
|
"step": 615
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 507.0,
|
|
"completions/max_terminated_length": 507.0,
|
|
"completions/mean_length": 250.22265625,
|
|
"completions/mean_terminated_length": 250.22265625,
|
|
"completions/min_length": 124.0,
|
|
"completions/min_terminated_length": 124.0,
|
|
"epoch": 0.9856,
|
|
"grad_norm": 0.03464104235172272,
|
|
"learning_rate": 1.601423487544484e-08,
|
|
"loss": 0.0092,
|
|
"num_tokens": 301154455.0,
|
|
"reward": 1.3746635913848877,
|
|
"reward_std": 0.11654820293188095,
|
|
"rewards/accuracy_reward_long_step": 0.46875,
|
|
"rewards/final_brier_reward_long_step": 0.7886804342269897,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8349737524986267,
|
|
"step": 616
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 416.0,
|
|
"completions/max_terminated_length": 416.0,
|
|
"completions/mean_length": 237.05078125,
|
|
"completions/mean_terminated_length": 237.05078125,
|
|
"completions/min_length": 121.0,
|
|
"completions/min_terminated_length": 121.0,
|
|
"epoch": 0.9872,
|
|
"grad_norm": 0.05452824756503105,
|
|
"learning_rate": 1.4234875444839856e-08,
|
|
"loss": 0.0012,
|
|
"num_tokens": 301632876.0,
|
|
"reward": 1.377845287322998,
|
|
"reward_std": 0.12956568598747253,
|
|
"rewards/accuracy_reward_long_step": 0.49609375,
|
|
"rewards/final_brier_reward_long_step": 0.7508812546730042,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7761249542236328,
|
|
"step": 617
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 467.0,
|
|
"completions/max_terminated_length": 467.0,
|
|
"completions/mean_length": 234.171875,
|
|
"completions/mean_terminated_length": 234.171875,
|
|
"completions/min_length": 126.0,
|
|
"completions/min_terminated_length": 126.0,
|
|
"epoch": 0.9888,
|
|
"grad_norm": 0.04497021064162254,
|
|
"learning_rate": 1.2455516014234875e-08,
|
|
"loss": -0.003,
|
|
"num_tokens": 302116440.0,
|
|
"reward": 1.360137701034546,
|
|
"reward_std": 0.2063150405883789,
|
|
"rewards/accuracy_reward_long_step": 0.5,
|
|
"rewards/final_brier_reward_long_step": 0.7225565910339355,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7179945111274719,
|
|
"step": 618
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 425.0,
|
|
"completions/max_terminated_length": 425.0,
|
|
"completions/mean_length": 239.23828125,
|
|
"completions/mean_terminated_length": 239.23828125,
|
|
"completions/min_length": 131.0,
|
|
"completions/min_terminated_length": 131.0,
|
|
"epoch": 0.9904,
|
|
"grad_norm": 0.038744006305933,
|
|
"learning_rate": 1.0676156583629893e-08,
|
|
"loss": -0.0001,
|
|
"num_tokens": 302594637.0,
|
|
"reward": 1.3021314144134521,
|
|
"reward_std": 0.11754617094993591,
|
|
"rewards/accuracy_reward_long_step": 0.41796875,
|
|
"rewards/final_brier_reward_long_step": 0.7479242086410522,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7887262105941772,
|
|
"step": 619
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 417.0,
|
|
"completions/max_terminated_length": 417.0,
|
|
"completions/mean_length": 228.94140625,
|
|
"completions/mean_terminated_length": 228.94140625,
|
|
"completions/min_length": 132.0,
|
|
"completions/min_terminated_length": 132.0,
|
|
"epoch": 0.992,
|
|
"grad_norm": 0.035442836582660675,
|
|
"learning_rate": 8.896797153024911e-09,
|
|
"loss": 0.0013,
|
|
"num_tokens": 303086934.0,
|
|
"reward": 1.61018705368042,
|
|
"reward_std": 0.09774182736873627,
|
|
"rewards/accuracy_reward_long_step": 0.71484375,
|
|
"rewards/final_brier_reward_long_step": 0.7831144332885742,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7982592582702637,
|
|
"step": 620
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 371.0,
|
|
"completions/max_terminated_length": 371.0,
|
|
"completions/mean_length": 224.3203125,
|
|
"completions/mean_terminated_length": 224.3203125,
|
|
"completions/min_length": 137.0,
|
|
"completions/min_terminated_length": 137.0,
|
|
"epoch": 0.9936,
|
|
"grad_norm": 0.05612906068563461,
|
|
"learning_rate": 7.117437722419928e-09,
|
|
"loss": 0.0006,
|
|
"num_tokens": 303565288.0,
|
|
"reward": 1.5199366807937622,
|
|
"reward_std": 0.0858568549156189,
|
|
"rewards/accuracy_reward_long_step": 0.62109375,
|
|
"rewards/final_brier_reward_long_step": 0.7536445260047913,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8417270183563232,
|
|
"step": 621
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 556.0,
|
|
"completions/max_terminated_length": 556.0,
|
|
"completions/mean_length": 244.44921875,
|
|
"completions/mean_terminated_length": 244.44921875,
|
|
"completions/min_length": 159.0,
|
|
"completions/min_terminated_length": 159.0,
|
|
"epoch": 0.9952,
|
|
"grad_norm": 0.05516738444566727,
|
|
"learning_rate": 5.338078291814947e-09,
|
|
"loss": 0.0138,
|
|
"num_tokens": 304051859.0,
|
|
"reward": 1.3782299757003784,
|
|
"reward_std": 0.11134977638721466,
|
|
"rewards/accuracy_reward_long_step": 0.4765625,
|
|
"rewards/final_brier_reward_long_step": 0.770743727684021,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8359257578849792,
|
|
"step": 622
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 415.0,
|
|
"completions/max_terminated_length": 415.0,
|
|
"completions/mean_length": 227.07421875,
|
|
"completions/mean_terminated_length": 227.07421875,
|
|
"completions/min_length": 112.0,
|
|
"completions/min_terminated_length": 112.0,
|
|
"epoch": 0.9968,
|
|
"grad_norm": 0.04374608024954796,
|
|
"learning_rate": 3.558718861209964e-09,
|
|
"loss": -0.0019,
|
|
"num_tokens": 304545614.0,
|
|
"reward": 1.507850170135498,
|
|
"reward_std": 0.16186021268367767,
|
|
"rewards/accuracy_reward_long_step": 0.6171875,
|
|
"rewards/final_brier_reward_long_step": 0.7282167673110962,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8344339728355408,
|
|
"step": 623
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 519.0,
|
|
"completions/max_terminated_length": 519.0,
|
|
"completions/mean_length": 238.9609375,
|
|
"completions/mean_terminated_length": 238.9609375,
|
|
"completions/min_length": 84.0,
|
|
"completions/min_terminated_length": 84.0,
|
|
"epoch": 0.9984,
|
|
"grad_norm": 0.042679328471422195,
|
|
"learning_rate": 1.779359430604982e-09,
|
|
"loss": 0.0036,
|
|
"num_tokens": 305024668.0,
|
|
"reward": 1.3625683784484863,
|
|
"reward_std": 0.12260974198579788,
|
|
"rewards/accuracy_reward_long_step": 0.48828125,
|
|
"rewards/final_brier_reward_long_step": 0.7129184007644653,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.7842304706573486,
|
|
"step": 624
|
|
},
|
|
{
|
|
"calib/answer_extract_rate": 0.0,
|
|
"calib/avg_num_step_conf": 0.0,
|
|
"calib/final_conf_rate": 0.0,
|
|
"calib/format_rate": 0.0,
|
|
"calib/nonempty_final_conf_rate": 0.0,
|
|
"calib/nonempty_reasoning_rate": 0.0,
|
|
"calib/nonempty_step_conf_rate": 0.0,
|
|
"calib/step_conf_rate": 0.0,
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 465.0,
|
|
"completions/max_terminated_length": 465.0,
|
|
"completions/mean_length": 234.5,
|
|
"completions/mean_terminated_length": 234.5,
|
|
"completions/min_length": 159.0,
|
|
"completions/min_terminated_length": 159.0,
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.04114016145467758,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.017,
|
|
"num_tokens": 305506462.0,
|
|
"reward": 1.5832818746566772,
|
|
"reward_std": 0.10286815464496613,
|
|
"rewards/accuracy_reward_long_step": 0.6875,
|
|
"rewards/final_brier_reward_long_step": 0.7612390518188477,
|
|
"rewards/format_reward_long_step": 1.0,
|
|
"rewards/stepwise_brier_reward_long_step": 0.8218887448310852,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"step": 625,
|
|
"total_flos": 0.0,
|
|
"train_loss": -0.007067593541555107,
|
|
"train_runtime": 30966.4861,
|
|
"train_samples_per_second": 0.646,
|
|
"train_steps_per_second": 0.02
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 625,
|
|
"num_input_tokens_seen": 305506462,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 20,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|