2264 lines
83 KiB
JSON
2264 lines
83 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.9984,
|
|
"eval_steps": 15,
|
|
"global_step": 78,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0595703125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 932.0,
|
|
"completions/mean_length": 252.85693359375,
|
|
"completions/mean_terminated_length": 204.00987243652344,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0128,
|
|
"grad_norm": 0.01206118892878294,
|
|
"learning_rate": 2.5e-07,
|
|
"loss": 0.062,
|
|
"num_tokens": 3474099.0,
|
|
"reward": 0.646336019039154,
|
|
"reward_std": 0.503198504447937,
|
|
"rewards/accuracy_reward": 0.24609375,
|
|
"rewards/brier_reward": 0.37909579277038574,
|
|
"rewards/confidence_one_or_zero": 0.26953125,
|
|
"rewards/format_reward": 0.66748046875,
|
|
"rewards/mean_confidence_reward": 0.7516889572143555,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0556640625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 995.0,
|
|
"completions/mean_length": 257.6103515625,
|
|
"completions/mean_terminated_length": 212.43536376953125,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0256,
|
|
"grad_norm": 0.05630214512348175,
|
|
"learning_rate": 5e-07,
|
|
"loss": 0.06,
|
|
"num_tokens": 7050869.0,
|
|
"reward": 0.5985734462738037,
|
|
"reward_std": 0.4480513036251068,
|
|
"rewards/accuracy_reward": 0.171875,
|
|
"rewards/brier_reward": 0.3416762948036194,
|
|
"rewards/confidence_one_or_zero": 0.25537109375,
|
|
"rewards/format_reward": 0.68359375,
|
|
"rewards/mean_confidence_reward": 0.7263393402099609,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.052734375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 254.6923828125,
|
|
"completions/mean_terminated_length": 211.86495971679688,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0384,
|
|
"grad_norm": 0.02448289282619953,
|
|
"learning_rate": 7.5e-07,
|
|
"loss": 0.0532,
|
|
"num_tokens": 10556279.0,
|
|
"reward": 0.6241230964660645,
|
|
"reward_std": 0.45209184288978577,
|
|
"rewards/accuracy_reward": 0.203125,
|
|
"rewards/brier_reward": 0.36250197887420654,
|
|
"rewards/confidence_one_or_zero": 0.26611328125,
|
|
"rewards/format_reward": 0.6826171875,
|
|
"rewards/mean_confidence_reward": 0.7384810447692871,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.05224609375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1021.0,
|
|
"completions/mean_length": 255.78857421875,
|
|
"completions/mean_terminated_length": 213.43997192382812,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0512,
|
|
"grad_norm": 0.009398565627634525,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0588,
|
|
"num_tokens": 14071174.0,
|
|
"reward": 0.625983715057373,
|
|
"reward_std": 0.46379736065864563,
|
|
"rewards/accuracy_reward": 0.1982421875,
|
|
"rewards/brier_reward": 0.3657350540161133,
|
|
"rewards/confidence_one_or_zero": 0.271484375,
|
|
"rewards/format_reward": 0.68798828125,
|
|
"rewards/mean_confidence_reward": 0.7340266704559326,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.04931640625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1012.0,
|
|
"completions/mean_length": 250.24267578125,
|
|
"completions/mean_terminated_length": 210.104248046875,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.064,
|
|
"grad_norm": 0.010432912968099117,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0609,
|
|
"num_tokens": 17581279.0,
|
|
"reward": 0.6377379894256592,
|
|
"reward_std": 0.4767524302005768,
|
|
"rewards/accuracy_reward": 0.2119140625,
|
|
"rewards/brier_reward": 0.37361857295036316,
|
|
"rewards/confidence_one_or_zero": 0.2607421875,
|
|
"rewards/format_reward": 0.68994140625,
|
|
"rewards/mean_confidence_reward": 0.72353196144104,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0419921875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 983.0,
|
|
"completions/mean_length": 233.49072265625,
|
|
"completions/mean_terminated_length": 198.8404541015625,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0768,
|
|
"grad_norm": 0.03446255251765251,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0446,
|
|
"num_tokens": 21031396.0,
|
|
"reward": 0.7123662233352661,
|
|
"reward_std": 0.4492800533771515,
|
|
"rewards/accuracy_reward": 0.2568359375,
|
|
"rewards/brier_reward": 0.41545307636260986,
|
|
"rewards/confidence_one_or_zero": 0.2783203125,
|
|
"rewards/format_reward": 0.75244140625,
|
|
"rewards/mean_confidence_reward": 0.7652001976966858,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0322265625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1020.0,
|
|
"completions/mean_length": 218.8916015625,
|
|
"completions/mean_terminated_length": 192.08172607421875,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.0896,
|
|
"grad_norm": 0.03061598353087902,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.043,
|
|
"num_tokens": 24512238.0,
|
|
"reward": 0.7533546686172485,
|
|
"reward_std": 0.40088847279548645,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/brier_reward": 0.43981271982192993,
|
|
"rewards/confidence_one_or_zero": 0.259765625,
|
|
"rewards/format_reward": 0.81689453125,
|
|
"rewards/mean_confidence_reward": 0.7687591314315796,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.017578125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1009.0,
|
|
"completions/mean_length": 187.7744140625,
|
|
"completions/mean_terminated_length": 172.81211853027344,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.1024,
|
|
"grad_norm": 0.014748022891581059,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0338,
|
|
"num_tokens": 27904744.0,
|
|
"reward": 0.8088976144790649,
|
|
"reward_std": 0.3649292290210724,
|
|
"rewards/accuracy_reward": 0.26953125,
|
|
"rewards/brier_reward": 0.46837902069091797,
|
|
"rewards/confidence_one_or_zero": 0.2646484375,
|
|
"rewards/format_reward": 0.8798828125,
|
|
"rewards/mean_confidence_reward": 0.7798730134963989,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 945.0,
|
|
"completions/mean_length": 175.26171875,
|
|
"completions/mean_terminated_length": 161.78968811035156,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.1152,
|
|
"grad_norm": 0.1248084306716919,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0267,
|
|
"num_tokens": 31256936.0,
|
|
"reward": 0.8849074840545654,
|
|
"reward_std": 0.3317580819129944,
|
|
"rewards/accuracy_reward": 0.32470703125,
|
|
"rewards/brier_reward": 0.5227425694465637,
|
|
"rewards/confidence_one_or_zero": 0.2705078125,
|
|
"rewards/format_reward": 0.92236328125,
|
|
"rewards/mean_confidence_reward": 0.7816927433013916,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.015625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 630.0,
|
|
"completions/mean_length": 162.44384765625,
|
|
"completions/mean_terminated_length": 148.7683563232422,
|
|
"completions/min_length": 5.0,
|
|
"completions/min_terminated_length": 5.0,
|
|
"epoch": 0.128,
|
|
"grad_norm": 0.013320323079824448,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.022,
|
|
"num_tokens": 34584917.0,
|
|
"reward": 0.8880295753479004,
|
|
"reward_std": 0.3083075284957886,
|
|
"rewards/accuracy_reward": 0.30712890625,
|
|
"rewards/brier_reward": 0.5314282178878784,
|
|
"rewards/confidence_one_or_zero": 0.2568359375,
|
|
"rewards/format_reward": 0.9375,
|
|
"rewards/mean_confidence_reward": 0.7687011957168579,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00927734375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 541.0,
|
|
"completions/mean_length": 149.92822265625,
|
|
"completions/mean_terminated_length": 141.74322509765625,
|
|
"completions/min_length": 13.0,
|
|
"completions/min_terminated_length": 13.0,
|
|
"epoch": 0.1408,
|
|
"grad_norm": 0.009336220100522041,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0137,
|
|
"num_tokens": 37879610.0,
|
|
"reward": 0.9682935476303101,
|
|
"reward_std": 0.30119815468788147,
|
|
"rewards/accuracy_reward": 0.39697265625,
|
|
"rewards/brier_reward": 0.5859990119934082,
|
|
"rewards/confidence_one_or_zero": 0.27587890625,
|
|
"rewards/format_reward": 0.95361328125,
|
|
"rewards/mean_confidence_reward": 0.7835807204246521,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.005859375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1000.0,
|
|
"completions/mean_length": 148.54931640625,
|
|
"completions/mean_terminated_length": 143.38949584960938,
|
|
"completions/min_length": 10.0,
|
|
"completions/min_terminated_length": 10.0,
|
|
"epoch": 0.1536,
|
|
"grad_norm": 0.01234795618802309,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0109,
|
|
"num_tokens": 41182895.0,
|
|
"reward": 0.9633985757827759,
|
|
"reward_std": 0.27885445952415466,
|
|
"rewards/accuracy_reward": 0.36572265625,
|
|
"rewards/brier_reward": 0.588416337966919,
|
|
"rewards/confidence_one_or_zero": 0.21923828125,
|
|
"rewards/format_reward": 0.97265625,
|
|
"rewards/mean_confidence_reward": 0.7653173804283142,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0029296875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 804.0,
|
|
"completions/mean_length": 134.34326171875,
|
|
"completions/mean_terminated_length": 131.72918701171875,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"epoch": 0.1664,
|
|
"grad_norm": 0.011425751261413097,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0041,
|
|
"num_tokens": 44485246.0,
|
|
"reward": 0.9407795071601868,
|
|
"reward_std": 0.2531838119029999,
|
|
"rewards/accuracy_reward": 0.32666015625,
|
|
"rewards/brier_reward": 0.5758930444717407,
|
|
"rewards/confidence_one_or_zero": 0.20947265625,
|
|
"rewards/format_reward": 0.97900390625,
|
|
"rewards/mean_confidence_reward": 0.75456702709198,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0029296875,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 1018.0,
|
|
"completions/mean_length": 138.37548828125,
|
|
"completions/mean_terminated_length": 135.77325439453125,
|
|
"completions/min_length": 1.0,
|
|
"completions/min_terminated_length": 1.0,
|
|
"epoch": 0.1792,
|
|
"grad_norm": 0.004313925746828318,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0035,
|
|
"num_tokens": 47816527.0,
|
|
"reward": 0.9434206485748291,
|
|
"reward_std": 0.24209100008010864,
|
|
"rewards/accuracy_reward": 0.31884765625,
|
|
"rewards/brier_reward": 0.5865465402603149,
|
|
"rewards/confidence_one_or_zero": 0.15283203125,
|
|
"rewards/format_reward": 0.9814453125,
|
|
"rewards/mean_confidence_reward": 0.7383813858032227,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00439453125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 741.0,
|
|
"completions/mean_length": 133.2216796875,
|
|
"completions/mean_terminated_length": 129.28985595703125,
|
|
"completions/min_length": 42.0,
|
|
"completions/min_terminated_length": 42.0,
|
|
"epoch": 0.192,
|
|
"grad_norm": 0.00784427672624588,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.006,
|
|
"num_tokens": 51033389.0,
|
|
"reward": 0.983768105506897,
|
|
"reward_std": 0.24871209263801575,
|
|
"rewards/accuracy_reward": 0.361328125,
|
|
"rewards/brier_reward": 0.623296320438385,
|
|
"rewards/confidence_one_or_zero": 0.1484375,
|
|
"rewards/format_reward": 0.98291015625,
|
|
"rewards/mean_confidence_reward": 0.7346706986427307,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.192,
|
|
"eval_completions/clipped_ratio": 0.00390625,
|
|
"eval_completions/max_length": 675.0,
|
|
"eval_completions/max_terminated_length": 347.5,
|
|
"eval_completions/mean_length": 134.63638305664062,
|
|
"eval_completions/mean_terminated_length": 131.14013671875,
|
|
"eval_completions/min_length": 56.0,
|
|
"eval_completions/min_terminated_length": 56.0,
|
|
"eval_loss": 0.0,
|
|
"eval_num_tokens": 51033389.0,
|
|
"eval_reward": 0.9371011257171631,
|
|
"eval_reward_std": 0.3611362501978874,
|
|
"eval_rewards/accuracy_reward": 0.30078125,
|
|
"eval_rewards/brier_reward": 0.5890443176031113,
|
|
"eval_rewards/confidence_one_or_zero": 0.12109375,
|
|
"eval_rewards/format_reward": 0.984375,
|
|
"eval_rewards/mean_confidence_reward": 0.712636724114418,
|
|
"eval_runtime": 50.8118,
|
|
"eval_samples_per_second": 9.84,
|
|
"eval_steps_per_second": 0.079,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00244140625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 865.0,
|
|
"completions/mean_length": 136.1689453125,
|
|
"completions/mean_terminated_length": 133.99607849121094,
|
|
"completions/min_length": 41.0,
|
|
"completions/min_terminated_length": 41.0,
|
|
"epoch": 0.2048,
|
|
"grad_norm": 0.015799518674612045,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 54328887.0,
|
|
"reward": 0.9970545768737793,
|
|
"reward_std": 0.23557817935943604,
|
|
"rewards/accuracy_reward": 0.38330078125,
|
|
"rewards/brier_reward": 0.6230137348175049,
|
|
"rewards/confidence_one_or_zero": 0.13623046875,
|
|
"rewards/format_reward": 0.98779296875,
|
|
"rewards/mean_confidence_reward": 0.7369694113731384,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00244140625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 586.0,
|
|
"completions/mean_length": 130.96484375,
|
|
"completions/mean_terminated_length": 128.7792510986328,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"epoch": 0.2176,
|
|
"grad_norm": 0.011801626533269882,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0024,
|
|
"num_tokens": 57602287.0,
|
|
"reward": 0.9894813299179077,
|
|
"reward_std": 0.22355304658412933,
|
|
"rewards/accuracy_reward": 0.35205078125,
|
|
"rewards/brier_reward": 0.6386289596557617,
|
|
"rewards/confidence_one_or_zero": 0.13720703125,
|
|
"rewards/format_reward": 0.98828125,
|
|
"rewards/mean_confidence_reward": 0.7070361375808716,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00244140625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 684.0,
|
|
"completions/mean_length": 133.64404296875,
|
|
"completions/mean_terminated_length": 131.46499633789062,
|
|
"completions/min_length": 2.0,
|
|
"completions/min_terminated_length": 2.0,
|
|
"epoch": 0.2304,
|
|
"grad_norm": 0.003088222583755851,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0024,
|
|
"num_tokens": 60943718.0,
|
|
"reward": 1.0247716903686523,
|
|
"reward_std": 0.2183222770690918,
|
|
"rewards/accuracy_reward": 0.39892578125,
|
|
"rewards/brier_reward": 0.6613582372665405,
|
|
"rewards/confidence_one_or_zero": 0.11376953125,
|
|
"rewards/format_reward": 0.9892578125,
|
|
"rewards/mean_confidence_reward": 0.7037431001663208,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.001953125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 684.0,
|
|
"completions/mean_length": 130.42626953125,
|
|
"completions/mean_terminated_length": 128.67759704589844,
|
|
"completions/min_length": 39.0,
|
|
"completions/min_terminated_length": 39.0,
|
|
"epoch": 0.2432,
|
|
"grad_norm": 0.0030834106728434563,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.001,
|
|
"num_tokens": 64218495.0,
|
|
"reward": 1.0376394987106323,
|
|
"reward_std": 0.22437486052513123,
|
|
"rewards/accuracy_reward": 0.40869140625,
|
|
"rewards/brier_reward": 0.6768399477005005,
|
|
"rewards/confidence_one_or_zero": 0.10986328125,
|
|
"rewards/format_reward": 0.98974609375,
|
|
"rewards/mean_confidence_reward": 0.6858630180358887,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0009765625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 605.0,
|
|
"completions/mean_length": 129.1884765625,
|
|
"completions/mean_terminated_length": 128.31378173828125,
|
|
"completions/min_length": 45.0,
|
|
"completions/min_terminated_length": 45.0,
|
|
"epoch": 0.256,
|
|
"grad_norm": 0.0019296990940347314,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0007,
|
|
"num_tokens": 67508849.0,
|
|
"reward": 1.0304187536239624,
|
|
"reward_std": 0.20604413747787476,
|
|
"rewards/accuracy_reward": 0.39013671875,
|
|
"rewards/brier_reward": 0.6755821108818054,
|
|
"rewards/confidence_one_or_zero": 0.0966796875,
|
|
"rewards/format_reward": 0.9951171875,
|
|
"rewards/mean_confidence_reward": 0.6770117282867432,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 919.0,
|
|
"completions/mean_length": 134.54052734375,
|
|
"completions/mean_terminated_length": 134.1060028076172,
|
|
"completions/min_length": 50.0,
|
|
"completions/min_terminated_length": 50.0,
|
|
"epoch": 0.2688,
|
|
"grad_norm": 0.0030771668534725904,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0013,
|
|
"num_tokens": 70787884.0,
|
|
"reward": 1.0380725860595703,
|
|
"reward_std": 0.197679340839386,
|
|
"rewards/accuracy_reward": 0.39208984375,
|
|
"rewards/brier_reward": 0.6879599690437317,
|
|
"rewards/confidence_one_or_zero": 0.0791015625,
|
|
"rewards/format_reward": 0.99609375,
|
|
"rewards/mean_confidence_reward": 0.6546210646629333,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00146484375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 766.0,
|
|
"completions/mean_length": 131.81640625,
|
|
"completions/mean_terminated_length": 130.50758361816406,
|
|
"completions/min_length": 37.0,
|
|
"completions/min_terminated_length": 37.0,
|
|
"epoch": 0.2816,
|
|
"grad_norm": 0.001605757512152195,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0016,
|
|
"num_tokens": 74049700.0,
|
|
"reward": 1.0163131952285767,
|
|
"reward_std": 0.19691747426986694,
|
|
"rewards/accuracy_reward": 0.35302734375,
|
|
"rewards/brier_reward": 0.6854572296142578,
|
|
"rewards/confidence_one_or_zero": 0.076171875,
|
|
"rewards/format_reward": 0.994140625,
|
|
"rewards/mean_confidence_reward": 0.6287341713905334,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 470.0,
|
|
"completions/max_terminated_length": 470.0,
|
|
"completions/mean_length": 130.5732421875,
|
|
"completions/mean_terminated_length": 130.5732421875,
|
|
"completions/min_length": 51.0,
|
|
"completions/min_terminated_length": 51.0,
|
|
"epoch": 0.2944,
|
|
"grad_norm": 0.0016451003029942513,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0007,
|
|
"num_tokens": 77286914.0,
|
|
"reward": 1.052234411239624,
|
|
"reward_std": 0.19646784663200378,
|
|
"rewards/accuracy_reward": 0.39404296875,
|
|
"rewards/brier_reward": 0.7128661274909973,
|
|
"rewards/confidence_one_or_zero": 0.06005859375,
|
|
"rewards/format_reward": 0.99755859375,
|
|
"rewards/mean_confidence_reward": 0.6058691143989563,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0009765625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 552.0,
|
|
"completions/mean_length": 135.2314453125,
|
|
"completions/mean_terminated_length": 134.36265563964844,
|
|
"completions/min_length": 50.0,
|
|
"completions/min_terminated_length": 50.0,
|
|
"epoch": 0.3072,
|
|
"grad_norm": 0.0020764193031936884,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0021,
|
|
"num_tokens": 80575708.0,
|
|
"reward": 1.030700922012329,
|
|
"reward_std": 0.17984464764595032,
|
|
"rewards/accuracy_reward": 0.357421875,
|
|
"rewards/brier_reward": 0.7069083452224731,
|
|
"rewards/confidence_one_or_zero": 0.05517578125,
|
|
"rewards/format_reward": 0.9970703125,
|
|
"rewards/mean_confidence_reward": 0.5960919857025146,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0009765625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 811.0,
|
|
"completions/mean_length": 135.84375,
|
|
"completions/mean_terminated_length": 134.97555541992188,
|
|
"completions/min_length": 36.0,
|
|
"completions/min_terminated_length": 36.0,
|
|
"epoch": 0.32,
|
|
"grad_norm": 0.0012462260201573372,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0008,
|
|
"num_tokens": 83862556.0,
|
|
"reward": 1.0884177684783936,
|
|
"reward_std": 0.16984757781028748,
|
|
"rewards/accuracy_reward": 0.43359375,
|
|
"rewards/brier_reward": 0.7461701035499573,
|
|
"rewards/confidence_one_or_zero": 0.060546875,
|
|
"rewards/format_reward": 0.9970703125,
|
|
"rewards/mean_confidence_reward": 0.5713028907775879,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00146484375,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 138.23193359375,
|
|
"completions/mean_terminated_length": 136.93251037597656,
|
|
"completions/min_length": 44.0,
|
|
"completions/min_terminated_length": 44.0,
|
|
"epoch": 0.3328,
|
|
"grad_norm": 0.0014977873070165515,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0025,
|
|
"num_tokens": 87091063.0,
|
|
"reward": 1.076505184173584,
|
|
"reward_std": 0.16160905361175537,
|
|
"rewards/accuracy_reward": 0.4140625,
|
|
"rewards/brier_reward": 0.741388201713562,
|
|
"rewards/confidence_one_or_zero": 0.0517578125,
|
|
"rewards/format_reward": 0.99755859375,
|
|
"rewards/mean_confidence_reward": 0.5425801277160645,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0009765625,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 836.0,
|
|
"completions/mean_length": 137.69921875,
|
|
"completions/mean_terminated_length": 136.8328399658203,
|
|
"completions/min_length": 43.0,
|
|
"completions/min_terminated_length": 43.0,
|
|
"epoch": 0.3456,
|
|
"grad_norm": 0.0013432031264528632,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0009,
|
|
"num_tokens": 90390975.0,
|
|
"reward": 1.0520920753479004,
|
|
"reward_std": 0.1620258092880249,
|
|
"rewards/accuracy_reward": 0.36328125,
|
|
"rewards/brier_reward": 0.7428549528121948,
|
|
"rewards/confidence_one_or_zero": 0.048828125,
|
|
"rewards/format_reward": 0.998046875,
|
|
"rewards/mean_confidence_reward": 0.5067415237426758,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 522.0,
|
|
"completions/mean_length": 139.10302734375,
|
|
"completions/mean_terminated_length": 138.6707305908203,
|
|
"completions/min_length": 48.0,
|
|
"completions/min_terminated_length": 48.0,
|
|
"epoch": 0.3584,
|
|
"grad_norm": 0.001807131338864565,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0007,
|
|
"num_tokens": 93687954.0,
|
|
"reward": 1.068045735359192,
|
|
"reward_std": 0.14087209105491638,
|
|
"rewards/accuracy_reward": 0.3828125,
|
|
"rewards/brier_reward": 0.7547427415847778,
|
|
"rewards/confidence_one_or_zero": 0.04541015625,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.4919547438621521,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 370.0,
|
|
"completions/mean_length": 144.34716796875,
|
|
"completions/mean_terminated_length": 143.9174346923828,
|
|
"completions/min_length": 52.0,
|
|
"completions/min_terminated_length": 52.0,
|
|
"epoch": 0.3712,
|
|
"grad_norm": 0.0023208060301840305,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0012,
|
|
"num_tokens": 97022633.0,
|
|
"reward": 1.065900206565857,
|
|
"reward_std": 0.13991808891296387,
|
|
"rewards/accuracy_reward": 0.37255859375,
|
|
"rewards/brier_reward": 0.7607055902481079,
|
|
"rewards/confidence_one_or_zero": 0.05712890625,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.4605600833892822,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 589.0,
|
|
"completions/max_terminated_length": 589.0,
|
|
"completions/mean_length": 143.078125,
|
|
"completions/mean_terminated_length": 143.078125,
|
|
"completions/min_length": 51.0,
|
|
"completions/min_terminated_length": 51.0,
|
|
"epoch": 0.384,
|
|
"grad_norm": 0.0033083283342421055,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 100277401.0,
|
|
"reward": 1.080049753189087,
|
|
"reward_std": 0.13700106739997864,
|
|
"rewards/accuracy_reward": 0.396484375,
|
|
"rewards/brier_reward": 0.7650789618492126,
|
|
"rewards/confidence_one_or_zero": 0.0498046875,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.4552270770072937,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.384,
|
|
"eval_completions/clipped_ratio": 0.0,
|
|
"eval_completions/max_length": 353.25,
|
|
"eval_completions/max_terminated_length": 353.25,
|
|
"eval_completions/mean_length": 144.8663787841797,
|
|
"eval_completions/mean_terminated_length": 144.8663787841797,
|
|
"eval_completions/min_length": 73.75,
|
|
"eval_completions/min_terminated_length": 73.75,
|
|
"eval_loss": 0.0,
|
|
"eval_num_tokens": 100277401.0,
|
|
"eval_reward": 1.0514324307441711,
|
|
"eval_reward_std": 0.226553276181221,
|
|
"eval_rewards/accuracy_reward": 0.330078125,
|
|
"eval_rewards/brier_reward": 0.7727857530117035,
|
|
"eval_rewards/confidence_one_or_zero": 0.0546875,
|
|
"eval_rewards/format_reward": 1.0,
|
|
"eval_rewards/mean_confidence_reward": 0.4388085976243019,
|
|
"eval_runtime": 36.7184,
|
|
"eval_samples_per_second": 13.617,
|
|
"eval_steps_per_second": 0.109,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 397.0,
|
|
"completions/mean_length": 143.3134765625,
|
|
"completions/mean_terminated_length": 142.88323974609375,
|
|
"completions/min_length": 57.0,
|
|
"completions/min_terminated_length": 57.0,
|
|
"epoch": 0.3968,
|
|
"grad_norm": 0.0009655402973294258,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0001,
|
|
"num_tokens": 103574067.0,
|
|
"reward": 1.0772579908370972,
|
|
"reward_std": 0.13317140936851501,
|
|
"rewards/accuracy_reward": 0.3837890625,
|
|
"rewards/brier_reward": 0.7721908092498779,
|
|
"rewards/confidence_one_or_zero": 0.05224609375,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.4339550733566284,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 523.0,
|
|
"completions/mean_length": 147.052734375,
|
|
"completions/mean_terminated_length": 146.62432861328125,
|
|
"completions/min_length": 58.0,
|
|
"completions/min_terminated_length": 58.0,
|
|
"epoch": 0.4096,
|
|
"grad_norm": 0.0014063868438825011,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 106852399.0,
|
|
"reward": 1.0804073810577393,
|
|
"reward_std": 0.12738269567489624,
|
|
"rewards/accuracy_reward": 0.39697265625,
|
|
"rewards/brier_reward": 0.7653061151504517,
|
|
"rewards/confidence_one_or_zero": 0.04296875,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.42521679401397705,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 426.0,
|
|
"completions/mean_length": 144.61669921875,
|
|
"completions/mean_terminated_length": 144.18710327148438,
|
|
"completions/min_length": 58.0,
|
|
"completions/min_terminated_length": 58.0,
|
|
"epoch": 0.4224,
|
|
"grad_norm": 0.0014064498245716095,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 110157062.0,
|
|
"reward": 1.0807609558105469,
|
|
"reward_std": 0.1330416202545166,
|
|
"rewards/accuracy_reward": 0.39404296875,
|
|
"rewards/brier_reward": 0.7699194550514221,
|
|
"rewards/confidence_one_or_zero": 0.048828125,
|
|
"rewards/format_reward": 0.99755859375,
|
|
"rewards/mean_confidence_reward": 0.4001578688621521,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 361.0,
|
|
"completions/max_terminated_length": 361.0,
|
|
"completions/mean_length": 150.60107421875,
|
|
"completions/mean_terminated_length": 150.60107421875,
|
|
"completions/min_length": 67.0,
|
|
"completions/min_terminated_length": 67.0,
|
|
"epoch": 0.4352,
|
|
"grad_norm": 0.0029456529300659895,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0003,
|
|
"num_tokens": 113450349.0,
|
|
"reward": 1.104508876800537,
|
|
"reward_std": 0.11853398382663727,
|
|
"rewards/accuracy_reward": 0.43408203125,
|
|
"rewards/brier_reward": 0.7754232287406921,
|
|
"rewards/confidence_one_or_zero": 0.04541015625,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.4032275378704071,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 432.0,
|
|
"completions/max_terminated_length": 432.0,
|
|
"completions/mean_length": 151.95751953125,
|
|
"completions/mean_terminated_length": 151.95751953125,
|
|
"completions/min_length": 50.0,
|
|
"completions/min_terminated_length": 50.0,
|
|
"epoch": 0.448,
|
|
"grad_norm": 0.0026214425452053547,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0004,
|
|
"num_tokens": 116759078.0,
|
|
"reward": 1.0844058990478516,
|
|
"reward_std": 0.11485770344734192,
|
|
"rewards/accuracy_reward": 0.408203125,
|
|
"rewards/brier_reward": 0.7615846395492554,
|
|
"rewards/confidence_one_or_zero": 0.044921875,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.38147109746932983,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 505.0,
|
|
"completions/max_terminated_length": 505.0,
|
|
"completions/mean_length": 153.82666015625,
|
|
"completions/mean_terminated_length": 153.82666015625,
|
|
"completions/min_length": 69.0,
|
|
"completions/min_terminated_length": 69.0,
|
|
"epoch": 0.4608,
|
|
"grad_norm": 0.0012385396985337138,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0007,
|
|
"num_tokens": 120127755.0,
|
|
"reward": 1.0605785846710205,
|
|
"reward_std": 0.10509373247623444,
|
|
"rewards/accuracy_reward": 0.3427734375,
|
|
"rewards/brier_reward": 0.7788711190223694,
|
|
"rewards/confidence_one_or_zero": 0.04248046875,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.3768359422683716,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 590.0,
|
|
"completions/mean_length": 153.3173828125,
|
|
"completions/mean_terminated_length": 152.8920440673828,
|
|
"completions/min_length": 66.0,
|
|
"completions/min_terminated_length": 66.0,
|
|
"epoch": 0.4736,
|
|
"grad_norm": 0.002045744564384222,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 123451997.0,
|
|
"reward": 1.0735026597976685,
|
|
"reward_std": 0.10574886202812195,
|
|
"rewards/accuracy_reward": 0.37060546875,
|
|
"rewards/brier_reward": 0.7773755788803101,
|
|
"rewards/confidence_one_or_zero": 0.0498046875,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.3696533143520355,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 633.0,
|
|
"completions/max_terminated_length": 633.0,
|
|
"completions/mean_length": 152.341796875,
|
|
"completions/mean_terminated_length": 152.341796875,
|
|
"completions/min_length": 57.0,
|
|
"completions/min_terminated_length": 57.0,
|
|
"epoch": 0.4864,
|
|
"grad_norm": 0.003814885625615716,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0004,
|
|
"num_tokens": 126776897.0,
|
|
"reward": 1.097827434539795,
|
|
"reward_std": 0.10771030187606812,
|
|
"rewards/accuracy_reward": 0.439453125,
|
|
"rewards/brier_reward": 0.7566891312599182,
|
|
"rewards/confidence_one_or_zero": 0.05615234375,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.34937989711761475,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 411.0,
|
|
"completions/max_terminated_length": 411.0,
|
|
"completions/mean_length": 156.4228515625,
|
|
"completions/mean_terminated_length": 156.4228515625,
|
|
"completions/min_length": 59.0,
|
|
"completions/min_terminated_length": 59.0,
|
|
"epoch": 0.4992,
|
|
"grad_norm": 0.0009488159557804465,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0005,
|
|
"num_tokens": 130164187.0,
|
|
"reward": 1.098426103591919,
|
|
"reward_std": 0.1080411821603775,
|
|
"rewards/accuracy_reward": 0.44775390625,
|
|
"rewards/brier_reward": 0.7495858073234558,
|
|
"rewards/confidence_one_or_zero": 0.04248046875,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.35322752594947815,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 375.0,
|
|
"completions/max_terminated_length": 375.0,
|
|
"completions/mean_length": 155.7001953125,
|
|
"completions/mean_terminated_length": 155.7001953125,
|
|
"completions/min_length": 59.0,
|
|
"completions/min_terminated_length": 59.0,
|
|
"epoch": 0.512,
|
|
"grad_norm": 0.0009362637065351009,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0004,
|
|
"num_tokens": 133507421.0,
|
|
"reward": 1.0903565883636475,
|
|
"reward_std": 0.10990739613771439,
|
|
"rewards/accuracy_reward": 0.41455078125,
|
|
"rewards/brier_reward": 0.7676265239715576,
|
|
"rewards/confidence_one_or_zero": 0.03759765625,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.3521386981010437,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 608.0,
|
|
"completions/max_terminated_length": 608.0,
|
|
"completions/mean_length": 155.94287109375,
|
|
"completions/mean_terminated_length": 155.94287109375,
|
|
"completions/min_length": 64.0,
|
|
"completions/min_terminated_length": 64.0,
|
|
"epoch": 0.5248,
|
|
"grad_norm": 0.0018933570245280862,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0003,
|
|
"num_tokens": 136822400.0,
|
|
"reward": 1.0992754697799683,
|
|
"reward_std": 0.10722452402114868,
|
|
"rewards/accuracy_reward": 0.42822265625,
|
|
"rewards/brier_reward": 0.7717922925949097,
|
|
"rewards/confidence_one_or_zero": 0.05810546875,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.35319823026657104,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 413.0,
|
|
"completions/max_terminated_length": 413.0,
|
|
"completions/mean_length": 159.2255859375,
|
|
"completions/mean_terminated_length": 159.2255859375,
|
|
"completions/min_length": 69.0,
|
|
"completions/min_terminated_length": 69.0,
|
|
"epoch": 0.5376,
|
|
"grad_norm": 0.0014252394903451204,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 140161310.0,
|
|
"reward": 1.1002483367919922,
|
|
"reward_std": 0.10786743462085724,
|
|
"rewards/accuracy_reward": 0.43994140625,
|
|
"rewards/brier_reward": 0.7615311145782471,
|
|
"rewards/confidence_one_or_zero": 0.04345703125,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.3506225645542145,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 499.0,
|
|
"completions/max_terminated_length": 499.0,
|
|
"completions/mean_length": 160.8203125,
|
|
"completions/mean_terminated_length": 160.8203125,
|
|
"completions/min_length": 70.0,
|
|
"completions/min_terminated_length": 70.0,
|
|
"epoch": 0.5504,
|
|
"grad_norm": 0.0009207865223288536,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0002,
|
|
"num_tokens": 143509286.0,
|
|
"reward": 1.1000754833221436,
|
|
"reward_std": 0.11076341569423676,
|
|
"rewards/accuracy_reward": 0.44384765625,
|
|
"rewards/brier_reward": 0.7572791576385498,
|
|
"rewards/confidence_one_or_zero": 0.03955078125,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.35358887910842896,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 467.0,
|
|
"completions/max_terminated_length": 467.0,
|
|
"completions/mean_length": 160.43994140625,
|
|
"completions/mean_terminated_length": 160.43994140625,
|
|
"completions/min_length": 72.0,
|
|
"completions/min_terminated_length": 72.0,
|
|
"epoch": 0.5632,
|
|
"grad_norm": 0.0006189781124703586,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0004,
|
|
"num_tokens": 146836963.0,
|
|
"reward": 1.0809619426727295,
|
|
"reward_std": 0.09901401400566101,
|
|
"rewards/accuracy_reward": 0.38720703125,
|
|
"rewards/brier_reward": 0.7761809825897217,
|
|
"rewards/confidence_one_or_zero": 0.04248046875,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.34800535440444946,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 413.0,
|
|
"completions/max_terminated_length": 413.0,
|
|
"completions/mean_length": 159.8115234375,
|
|
"completions/mean_terminated_length": 159.8115234375,
|
|
"completions/min_length": 70.0,
|
|
"completions/min_terminated_length": 70.0,
|
|
"epoch": 0.576,
|
|
"grad_norm": 0.02215094119310379,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 150188409.0,
|
|
"reward": 1.0951708555221558,
|
|
"reward_std": 0.10278713703155518,
|
|
"rewards/accuracy_reward": 0.4228515625,
|
|
"rewards/brier_reward": 0.7679775953292847,
|
|
"rewards/confidence_one_or_zero": 0.033203125,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.35493797063827515,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.576,
|
|
"eval_completions/clipped_ratio": 0.0,
|
|
"eval_completions/max_length": 287.75,
|
|
"eval_completions/max_terminated_length": 287.75,
|
|
"eval_completions/mean_length": 161.08263778686523,
|
|
"eval_completions/mean_terminated_length": 161.08263778686523,
|
|
"eval_completions/min_length": 85.5,
|
|
"eval_completions/min_terminated_length": 85.5,
|
|
"eval_loss": 0.0,
|
|
"eval_num_tokens": 150188409.0,
|
|
"eval_reward": 1.0704601407051086,
|
|
"eval_reward_std": 0.18598725646734238,
|
|
"eval_rewards/accuracy_reward": 0.349609375,
|
|
"eval_rewards/brier_reward": 0.7913101464509964,
|
|
"eval_rewards/confidence_one_or_zero": 0.03125,
|
|
"eval_rewards/format_reward": 1.0,
|
|
"eval_rewards/mean_confidence_reward": 0.36238280683755875,
|
|
"eval_runtime": 33.0054,
|
|
"eval_samples_per_second": 15.149,
|
|
"eval_steps_per_second": 0.121,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 478.0,
|
|
"completions/max_terminated_length": 478.0,
|
|
"completions/mean_length": 159.986328125,
|
|
"completions/mean_terminated_length": 159.986328125,
|
|
"completions/min_length": 35.0,
|
|
"completions/min_terminated_length": 35.0,
|
|
"epoch": 0.5888,
|
|
"grad_norm": 0.003660782240331173,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0006,
|
|
"num_tokens": 153552477.0,
|
|
"reward": 1.095861792564392,
|
|
"reward_std": 0.0982513576745987,
|
|
"rewards/accuracy_reward": 0.4150390625,
|
|
"rewards/brier_reward": 0.7776603102684021,
|
|
"rewards/confidence_one_or_zero": 0.048828125,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.36439940333366394,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 443.0,
|
|
"completions/max_terminated_length": 443.0,
|
|
"completions/mean_length": 161.75146484375,
|
|
"completions/mean_terminated_length": 161.75146484375,
|
|
"completions/min_length": 72.0,
|
|
"completions/min_terminated_length": 72.0,
|
|
"epoch": 0.6016,
|
|
"grad_norm": 0.004005622584372759,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0001,
|
|
"num_tokens": 156878152.0,
|
|
"reward": 1.1043840646743774,
|
|
"reward_std": 0.10187876969575882,
|
|
"rewards/accuracy_reward": 0.43408203125,
|
|
"rewards/brier_reward": 0.7761501669883728,
|
|
"rewards/confidence_one_or_zero": 0.03759765625,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.3677002191543579,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 361.0,
|
|
"completions/max_terminated_length": 361.0,
|
|
"completions/mean_length": 163.7666015625,
|
|
"completions/mean_terminated_length": 163.7666015625,
|
|
"completions/min_length": 69.0,
|
|
"completions/min_terminated_length": 69.0,
|
|
"epoch": 0.6144,
|
|
"grad_norm": 0.0013804073678329587,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0001,
|
|
"num_tokens": 160274850.0,
|
|
"reward": 1.1110379695892334,
|
|
"reward_std": 0.09990298002958298,
|
|
"rewards/accuracy_reward": 0.45361328125,
|
|
"rewards/brier_reward": 0.7684618234634399,
|
|
"rewards/confidence_one_or_zero": 0.0341796875,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.3786962926387787,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 571.0,
|
|
"completions/max_terminated_length": 571.0,
|
|
"completions/mean_length": 165.44677734375,
|
|
"completions/mean_terminated_length": 165.44677734375,
|
|
"completions/min_length": 80.0,
|
|
"completions/min_terminated_length": 80.0,
|
|
"epoch": 0.6272,
|
|
"grad_norm": 0.0006175300804898143,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0005,
|
|
"num_tokens": 163673933.0,
|
|
"reward": 1.1008734703063965,
|
|
"reward_std": 0.1041068434715271,
|
|
"rewards/accuracy_reward": 0.43603515625,
|
|
"rewards/brier_reward": 0.7657108902931213,
|
|
"rewards/confidence_one_or_zero": 0.0205078125,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.38051414489746094,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 553.0,
|
|
"completions/max_terminated_length": 553.0,
|
|
"completions/mean_length": 166.4296875,
|
|
"completions/mean_terminated_length": 166.4296875,
|
|
"completions/min_length": 76.0,
|
|
"completions/min_terminated_length": 76.0,
|
|
"epoch": 0.64,
|
|
"grad_norm": 0.0007595557253807783,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0001,
|
|
"num_tokens": 167075853.0,
|
|
"reward": 1.119231104850769,
|
|
"reward_std": 0.10055085271596909,
|
|
"rewards/accuracy_reward": 0.4638671875,
|
|
"rewards/brier_reward": 0.7750825881958008,
|
|
"rewards/confidence_one_or_zero": 0.02685546875,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.40596190094947815,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 452.0,
|
|
"completions/max_terminated_length": 452.0,
|
|
"completions/mean_length": 166.87060546875,
|
|
"completions/mean_terminated_length": 166.87060546875,
|
|
"completions/min_length": 71.0,
|
|
"completions/min_terminated_length": 71.0,
|
|
"epoch": 0.6528,
|
|
"grad_norm": 0.0037527712993323803,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 170378204.0,
|
|
"reward": 1.0836341381072998,
|
|
"reward_std": 0.10692138969898224,
|
|
"rewards/accuracy_reward": 0.39794921875,
|
|
"rewards/brier_reward": 0.7717597484588623,
|
|
"rewards/confidence_one_or_zero": 0.02587890625,
|
|
"rewards/format_reward": 0.99755859375,
|
|
"rewards/mean_confidence_reward": 0.4016748070716858,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 527.0,
|
|
"completions/max_terminated_length": 527.0,
|
|
"completions/mean_length": 165.9951171875,
|
|
"completions/mean_terminated_length": 165.9951171875,
|
|
"completions/min_length": 77.0,
|
|
"completions/min_terminated_length": 77.0,
|
|
"epoch": 0.6656,
|
|
"grad_norm": 0.0017999252304434776,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 173709242.0,
|
|
"reward": 1.1069138050079346,
|
|
"reward_std": 0.10350505262613297,
|
|
"rewards/accuracy_reward": 0.4482421875,
|
|
"rewards/brier_reward": 0.7655847072601318,
|
|
"rewards/confidence_one_or_zero": 0.02392578125,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.4061328172683716,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 447.0,
|
|
"completions/max_terminated_length": 447.0,
|
|
"completions/mean_length": 167.8916015625,
|
|
"completions/mean_terminated_length": 167.8916015625,
|
|
"completions/min_length": 70.0,
|
|
"completions/min_terminated_length": 70.0,
|
|
"epoch": 0.6784,
|
|
"grad_norm": 0.002350540366023779,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 177022084.0,
|
|
"reward": 1.08831787109375,
|
|
"reward_std": 0.10483807325363159,
|
|
"rewards/accuracy_reward": 0.39990234375,
|
|
"rewards/brier_reward": 0.7767325639724731,
|
|
"rewards/confidence_one_or_zero": 0.02294921875,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.4102539122104645,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 601.0,
|
|
"completions/max_terminated_length": 601.0,
|
|
"completions/mean_length": 170.22705078125,
|
|
"completions/mean_terminated_length": 170.22705078125,
|
|
"completions/min_length": 81.0,
|
|
"completions/min_terminated_length": 81.0,
|
|
"epoch": 0.6912,
|
|
"grad_norm": 0.0024247784167528152,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0002,
|
|
"num_tokens": 180378381.0,
|
|
"reward": 1.0937700271606445,
|
|
"reward_std": 0.11120226234197617,
|
|
"rewards/accuracy_reward": 0.41552734375,
|
|
"rewards/brier_reward": 0.7725003957748413,
|
|
"rewards/confidence_one_or_zero": 0.01171875,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.41514649987220764,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 481.0,
|
|
"completions/max_terminated_length": 481.0,
|
|
"completions/mean_length": 171.05126953125,
|
|
"completions/mean_terminated_length": 171.05126953125,
|
|
"completions/min_length": 82.0,
|
|
"completions/min_terminated_length": 82.0,
|
|
"epoch": 0.704,
|
|
"grad_norm": 0.0025645701680332422,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0002,
|
|
"num_tokens": 183697854.0,
|
|
"reward": 1.1110807657241821,
|
|
"reward_std": 0.11133909225463867,
|
|
"rewards/accuracy_reward": 0.45947265625,
|
|
"rewards/brier_reward": 0.7631762027740479,
|
|
"rewards/confidence_one_or_zero": 0.0224609375,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.42967772483825684,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 569.0,
|
|
"completions/max_terminated_length": 569.0,
|
|
"completions/mean_length": 171.7705078125,
|
|
"completions/mean_terminated_length": 171.7705078125,
|
|
"completions/min_length": 80.0,
|
|
"completions/min_terminated_length": 80.0,
|
|
"epoch": 0.7168,
|
|
"grad_norm": 0.0012854809174314141,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0004,
|
|
"num_tokens": 187083480.0,
|
|
"reward": 1.1149908304214478,
|
|
"reward_std": 0.11459929496049881,
|
|
"rewards/accuracy_reward": 0.4580078125,
|
|
"rewards/brier_reward": 0.7744144797325134,
|
|
"rewards/confidence_one_or_zero": 0.0146484375,
|
|
"rewards/format_reward": 0.99755859375,
|
|
"rewards/mean_confidence_reward": 0.4292285144329071,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 503.0,
|
|
"completions/max_terminated_length": 503.0,
|
|
"completions/mean_length": 171.90771484375,
|
|
"completions/mean_terminated_length": 171.90771484375,
|
|
"completions/min_length": 58.0,
|
|
"completions/min_terminated_length": 58.0,
|
|
"epoch": 0.7296,
|
|
"grad_norm": 0.0018713475437834859,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0003,
|
|
"num_tokens": 190385139.0,
|
|
"reward": 1.1170523166656494,
|
|
"reward_std": 0.11444520950317383,
|
|
"rewards/accuracy_reward": 0.4619140625,
|
|
"rewards/brier_reward": 0.772189736366272,
|
|
"rewards/confidence_one_or_zero": 0.02001953125,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.4356445372104645,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 401.0,
|
|
"completions/max_terminated_length": 401.0,
|
|
"completions/mean_length": 170.96044921875,
|
|
"completions/mean_terminated_length": 170.96044921875,
|
|
"completions/min_length": 60.0,
|
|
"completions/min_terminated_length": 60.0,
|
|
"epoch": 0.7424,
|
|
"grad_norm": 0.0026220239233225584,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0008,
|
|
"num_tokens": 193783130.0,
|
|
"reward": 1.1173450946807861,
|
|
"reward_std": 0.11841192841529846,
|
|
"rewards/accuracy_reward": 0.46337890625,
|
|
"rewards/brier_reward": 0.7713105082511902,
|
|
"rewards/confidence_one_or_zero": 0.01416015625,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.4385010004043579,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 364.0,
|
|
"completions/max_terminated_length": 364.0,
|
|
"completions/mean_length": 173.064453125,
|
|
"completions/mean_terminated_length": 173.064453125,
|
|
"completions/min_length": 75.0,
|
|
"completions/min_terminated_length": 75.0,
|
|
"epoch": 0.7552,
|
|
"grad_norm": 0.0012924442999064922,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0012,
|
|
"num_tokens": 197140494.0,
|
|
"reward": 1.105983018875122,
|
|
"reward_std": 0.107911616563797,
|
|
"rewards/accuracy_reward": 0.43505859375,
|
|
"rewards/brier_reward": 0.7773948907852173,
|
|
"rewards/confidence_one_or_zero": 0.015625,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.44759440422058105,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 757.0,
|
|
"completions/mean_length": 176.484375,
|
|
"completions/mean_terminated_length": 176.07034301757812,
|
|
"completions/min_length": 76.0,
|
|
"completions/min_terminated_length": 76.0,
|
|
"epoch": 0.768,
|
|
"grad_norm": 0.0028388705104589462,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0005,
|
|
"num_tokens": 200495046.0,
|
|
"reward": 1.110207438468933,
|
|
"reward_std": 0.11490217596292496,
|
|
"rewards/accuracy_reward": 0.44677734375,
|
|
"rewards/brier_reward": 0.7746132612228394,
|
|
"rewards/confidence_one_or_zero": 0.01171875,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.45084962248802185,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.768,
|
|
"eval_completions/clipped_ratio": 0.0,
|
|
"eval_completions/max_length": 340.5,
|
|
"eval_completions/max_terminated_length": 340.5,
|
|
"eval_completions/mean_length": 177.17342376708984,
|
|
"eval_completions/mean_terminated_length": 177.17342376708984,
|
|
"eval_completions/min_length": 83.5,
|
|
"eval_completions/min_terminated_length": 83.5,
|
|
"eval_loss": 0.0,
|
|
"eval_num_tokens": 200495046.0,
|
|
"eval_reward": 1.060145229101181,
|
|
"eval_reward_std": 0.23021632805466652,
|
|
"eval_rewards/accuracy_reward": 0.3515625,
|
|
"eval_rewards/brier_reward": 0.7706802636384964,
|
|
"eval_rewards/confidence_one_or_zero": 0.013671875,
|
|
"eval_rewards/format_reward": 0.998046875,
|
|
"eval_rewards/mean_confidence_reward": 0.44810547679662704,
|
|
"eval_runtime": 35.5968,
|
|
"eval_samples_per_second": 14.046,
|
|
"eval_steps_per_second": 0.112,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 473.0,
|
|
"completions/max_terminated_length": 473.0,
|
|
"completions/mean_length": 175.306640625,
|
|
"completions/mean_terminated_length": 175.306640625,
|
|
"completions/min_length": 76.0,
|
|
"completions/min_terminated_length": 76.0,
|
|
"epoch": 0.7808,
|
|
"grad_norm": 0.0019142339006066322,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 203882754.0,
|
|
"reward": 1.1105574369430542,
|
|
"reward_std": 0.1143236756324768,
|
|
"rewards/accuracy_reward": 0.44970703125,
|
|
"rewards/brier_reward": 0.771406888961792,
|
|
"rewards/confidence_one_or_zero": 0.01416015625,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.44986817240715027,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 479.0,
|
|
"completions/max_terminated_length": 479.0,
|
|
"completions/mean_length": 175.62744140625,
|
|
"completions/mean_terminated_length": 175.62744140625,
|
|
"completions/min_length": 70.0,
|
|
"completions/min_terminated_length": 70.0,
|
|
"epoch": 0.7936,
|
|
"grad_norm": 0.0028708542231470346,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.001,
|
|
"num_tokens": 207246823.0,
|
|
"reward": 1.115567922592163,
|
|
"reward_std": 0.11952622979879379,
|
|
"rewards/accuracy_reward": 0.4619140625,
|
|
"rewards/brier_reward": 0.770685613155365,
|
|
"rewards/confidence_one_or_zero": 0.01708984375,
|
|
"rewards/format_reward": 0.99853515625,
|
|
"rewards/mean_confidence_reward": 0.4633447527885437,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 388.0,
|
|
"completions/max_terminated_length": 388.0,
|
|
"completions/mean_length": 171.20751953125,
|
|
"completions/mean_terminated_length": 171.20751953125,
|
|
"completions/min_length": 79.0,
|
|
"completions/min_terminated_length": 79.0,
|
|
"epoch": 0.8064,
|
|
"grad_norm": 0.013748224824666977,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0002,
|
|
"num_tokens": 210614152.0,
|
|
"reward": 1.1377573013305664,
|
|
"reward_std": 0.10060383379459381,
|
|
"rewards/accuracy_reward": 0.50634765625,
|
|
"rewards/brier_reward": 0.769166111946106,
|
|
"rewards/confidence_one_or_zero": 0.01708984375,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.47361326217651367,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 546.0,
|
|
"completions/max_terminated_length": 546.0,
|
|
"completions/mean_length": 173.845703125,
|
|
"completions/mean_terminated_length": 173.845703125,
|
|
"completions/min_length": 71.0,
|
|
"completions/min_terminated_length": 71.0,
|
|
"epoch": 0.8192,
|
|
"grad_norm": 0.0016836629947647452,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0014,
|
|
"num_tokens": 213986532.0,
|
|
"reward": 1.1204241514205933,
|
|
"reward_std": 0.12249487638473511,
|
|
"rewards/accuracy_reward": 0.466796875,
|
|
"rewards/brier_reward": 0.7740505933761597,
|
|
"rewards/confidence_one_or_zero": 0.01806640625,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.4648388624191284,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 401.0,
|
|
"completions/max_terminated_length": 401.0,
|
|
"completions/mean_length": 175.62158203125,
|
|
"completions/mean_terminated_length": 175.62158203125,
|
|
"completions/min_length": 76.0,
|
|
"completions/min_terminated_length": 76.0,
|
|
"epoch": 0.832,
|
|
"grad_norm": 0.008333772420883179,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0006,
|
|
"num_tokens": 217353213.0,
|
|
"reward": 1.1017959117889404,
|
|
"reward_std": 0.11205719411373138,
|
|
"rewards/accuracy_reward": 0.43310546875,
|
|
"rewards/brier_reward": 0.7714620232582092,
|
|
"rewards/confidence_one_or_zero": 0.0166015625,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.46193361282348633,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 442.0,
|
|
"completions/max_terminated_length": 442.0,
|
|
"completions/mean_length": 174.36376953125,
|
|
"completions/mean_terminated_length": 174.36376953125,
|
|
"completions/min_length": 79.0,
|
|
"completions/min_terminated_length": 79.0,
|
|
"epoch": 0.8448,
|
|
"grad_norm": 0.0011087502352893353,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0003,
|
|
"num_tokens": 220702790.0,
|
|
"reward": 1.1077888011932373,
|
|
"reward_std": 0.11732746660709381,
|
|
"rewards/accuracy_reward": 0.443359375,
|
|
"rewards/brier_reward": 0.773193895816803,
|
|
"rewards/confidence_one_or_zero": 0.0146484375,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.4771631062030792,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 563.0,
|
|
"completions/max_terminated_length": 563.0,
|
|
"completions/mean_length": 180.34814453125,
|
|
"completions/mean_terminated_length": 180.34814453125,
|
|
"completions/min_length": 86.0,
|
|
"completions/min_terminated_length": 86.0,
|
|
"epoch": 0.8576,
|
|
"grad_norm": 0.0014937082305550575,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0008,
|
|
"num_tokens": 224068031.0,
|
|
"reward": 1.111690640449524,
|
|
"reward_std": 0.10702188313007355,
|
|
"rewards/accuracy_reward": 0.44921875,
|
|
"rewards/brier_reward": 0.774161696434021,
|
|
"rewards/confidence_one_or_zero": 0.0107421875,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.47103172540664673,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 496.0,
|
|
"completions/max_terminated_length": 496.0,
|
|
"completions/mean_length": 179.4482421875,
|
|
"completions/mean_terminated_length": 179.4482421875,
|
|
"completions/min_length": 80.0,
|
|
"completions/min_terminated_length": 80.0,
|
|
"epoch": 0.8704,
|
|
"grad_norm": 0.003253382397815585,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0001,
|
|
"num_tokens": 227500213.0,
|
|
"reward": 1.1289840936660767,
|
|
"reward_std": 0.11423557996749878,
|
|
"rewards/accuracy_reward": 0.4765625,
|
|
"rewards/brier_reward": 0.7823812365531921,
|
|
"rewards/confidence_one_or_zero": 0.01513671875,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.4756738245487213,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 506.0,
|
|
"completions/max_terminated_length": 506.0,
|
|
"completions/mean_length": 177.3544921875,
|
|
"completions/mean_terminated_length": 177.3544921875,
|
|
"completions/min_length": 65.0,
|
|
"completions/min_terminated_length": 65.0,
|
|
"epoch": 0.8832,
|
|
"grad_norm": 0.000763367279432714,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0004,
|
|
"num_tokens": 230841907.0,
|
|
"reward": 1.0981295108795166,
|
|
"reward_std": 0.11358843743801117,
|
|
"rewards/accuracy_reward": 0.41943359375,
|
|
"rewards/brier_reward": 0.7768245935440063,
|
|
"rewards/confidence_one_or_zero": 0.015625,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.4759277403354645,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 411.0,
|
|
"completions/mean_length": 178.068359375,
|
|
"completions/mean_terminated_length": 177.6551055908203,
|
|
"completions/min_length": 96.0,
|
|
"completions/min_terminated_length": 96.0,
|
|
"epoch": 0.896,
|
|
"grad_norm": 0.000891282397788018,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0003,
|
|
"num_tokens": 234239855.0,
|
|
"reward": 1.1204962730407715,
|
|
"reward_std": 0.11121779680252075,
|
|
"rewards/accuracy_reward": 0.4697265625,
|
|
"rewards/brier_reward": 0.7717532515525818,
|
|
"rewards/confidence_one_or_zero": 0.01318359375,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.48045408725738525,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 453.0,
|
|
"completions/max_terminated_length": 453.0,
|
|
"completions/mean_length": 181.03125,
|
|
"completions/mean_terminated_length": 181.03125,
|
|
"completions/min_length": 82.0,
|
|
"completions/min_terminated_length": 82.0,
|
|
"epoch": 0.9088,
|
|
"grad_norm": 0.0010501582873985171,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0008,
|
|
"num_tokens": 237612871.0,
|
|
"reward": 1.1125749349594116,
|
|
"reward_std": 0.11690981686115265,
|
|
"rewards/accuracy_reward": 0.45654296875,
|
|
"rewards/brier_reward": 0.7695823907852173,
|
|
"rewards/confidence_one_or_zero": 0.01025390625,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.47194337844848633,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 564.0,
|
|
"completions/max_terminated_length": 564.0,
|
|
"completions/mean_length": 179.0400390625,
|
|
"completions/mean_terminated_length": 179.0400390625,
|
|
"completions/min_length": 88.0,
|
|
"completions/min_terminated_length": 88.0,
|
|
"epoch": 0.9216,
|
|
"grad_norm": 0.0008443333790637553,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0004,
|
|
"num_tokens": 241019905.0,
|
|
"reward": 1.1193747520446777,
|
|
"reward_std": 0.11691069602966309,
|
|
"rewards/accuracy_reward": 0.470703125,
|
|
"rewards/brier_reward": 0.7690220475196838,
|
|
"rewards/confidence_one_or_zero": 0.02294921875,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.46311524510383606,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 504.0,
|
|
"completions/max_terminated_length": 504.0,
|
|
"completions/mean_length": 177.8017578125,
|
|
"completions/mean_terminated_length": 177.8017578125,
|
|
"completions/min_length": 75.0,
|
|
"completions/min_terminated_length": 75.0,
|
|
"epoch": 0.9344,
|
|
"grad_norm": 0.000890648050699383,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0004,
|
|
"num_tokens": 244393011.0,
|
|
"reward": 1.0996851921081543,
|
|
"reward_std": 0.1123179942369461,
|
|
"rewards/accuracy_reward": 0.41845703125,
|
|
"rewards/brier_reward": 0.7809123992919922,
|
|
"rewards/confidence_one_or_zero": 0.02197265625,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.4633203148841858,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 591.0,
|
|
"completions/max_terminated_length": 591.0,
|
|
"completions/mean_length": 179.6220703125,
|
|
"completions/mean_terminated_length": 179.6220703125,
|
|
"completions/min_length": 88.0,
|
|
"completions/min_terminated_length": 88.0,
|
|
"epoch": 0.9472,
|
|
"grad_norm": 0.000775814289227128,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0001,
|
|
"num_tokens": 247706293.0,
|
|
"reward": 1.115156650543213,
|
|
"reward_std": 0.10829775035381317,
|
|
"rewards/accuracy_reward": 0.45654296875,
|
|
"rewards/brier_reward": 0.7737695574760437,
|
|
"rewards/confidence_one_or_zero": 0.0185546875,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.4643896520137787,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 525.0,
|
|
"completions/max_terminated_length": 525.0,
|
|
"completions/mean_length": 180.197265625,
|
|
"completions/mean_terminated_length": 180.197265625,
|
|
"completions/min_length": 77.0,
|
|
"completions/min_terminated_length": 77.0,
|
|
"epoch": 0.96,
|
|
"grad_norm": 0.0024549230001866817,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 251076793.0,
|
|
"reward": 1.1133463382720947,
|
|
"reward_std": 0.10567133873701096,
|
|
"rewards/accuracy_reward": 0.4453125,
|
|
"rewards/brier_reward": 0.781867504119873,
|
|
"rewards/confidence_one_or_zero": 0.01123046875,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.481499046087265,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.96,
|
|
"eval_completions/clipped_ratio": 0.0,
|
|
"eval_completions/max_length": 335.25,
|
|
"eval_completions/max_terminated_length": 335.25,
|
|
"eval_completions/mean_length": 180.3135108947754,
|
|
"eval_completions/mean_terminated_length": 180.3135108947754,
|
|
"eval_completions/min_length": 106.0,
|
|
"eval_completions/min_terminated_length": 106.0,
|
|
"eval_loss": 0.0,
|
|
"eval_num_tokens": 251076793.0,
|
|
"eval_reward": 1.0895682871341705,
|
|
"eval_reward_std": 0.24639935791492462,
|
|
"eval_rewards/accuracy_reward": 0.396484375,
|
|
"eval_rewards/brier_reward": 0.7826513648033142,
|
|
"eval_rewards/confidence_one_or_zero": 0.009765625,
|
|
"eval_rewards/format_reward": 1.0,
|
|
"eval_rewards/mean_confidence_reward": 0.46923828125,
|
|
"eval_runtime": 35.4002,
|
|
"eval_samples_per_second": 14.124,
|
|
"eval_steps_per_second": 0.113,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.00048828125,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 534.0,
|
|
"completions/mean_length": 182.37158203125,
|
|
"completions/mean_terminated_length": 181.96043395996094,
|
|
"completions/min_length": 73.0,
|
|
"completions/min_terminated_length": 73.0,
|
|
"epoch": 0.9728,
|
|
"grad_norm": 0.0009286075364798307,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0008,
|
|
"num_tokens": 254430658.0,
|
|
"reward": 1.1125905513763428,
|
|
"reward_std": 0.11536484956741333,
|
|
"rewards/accuracy_reward": 0.45068359375,
|
|
"rewards/brier_reward": 0.7749849557876587,
|
|
"rewards/confidence_one_or_zero": 0.015625,
|
|
"rewards/format_reward": 0.99951171875,
|
|
"rewards/mean_confidence_reward": 0.4785693287849426,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 419.0,
|
|
"completions/max_terminated_length": 419.0,
|
|
"completions/mean_length": 181.07470703125,
|
|
"completions/mean_terminated_length": 181.07470703125,
|
|
"completions/min_length": 87.0,
|
|
"completions/min_terminated_length": 87.0,
|
|
"epoch": 0.9856,
|
|
"grad_norm": 0.0007356080459430814,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0001,
|
|
"num_tokens": 257823147.0,
|
|
"reward": 1.113877773284912,
|
|
"reward_std": 0.1039476990699768,
|
|
"rewards/accuracy_reward": 0.44091796875,
|
|
"rewards/brier_reward": 0.7868366837501526,
|
|
"rewards/confidence_one_or_zero": 0.02685546875,
|
|
"rewards/format_reward": 1.0,
|
|
"rewards/mean_confidence_reward": 0.4647803008556366,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0006377551020407823,
|
|
"completions/max_length": 1024.0,
|
|
"completions/max_terminated_length": 371.0,
|
|
"completions/mean_length": 179.0899200439453,
|
|
"completions/mean_terminated_length": 178.5507354736328,
|
|
"completions/min_length": 1.0,
|
|
"completions/min_terminated_length": 1.0,
|
|
"epoch": 0.9984,
|
|
"grad_norm": 0.008926053531467915,
|
|
"learning_rate": 1e-06,
|
|
"loss": -0.0,
|
|
"num_tokens": 261189064.0,
|
|
"reward": 1.1190553903579712,
|
|
"reward_std": 0.1117812842130661,
|
|
"rewards/accuracy_reward": 0.466796875,
|
|
"rewards/brier_reward": 0.772289514541626,
|
|
"rewards/confidence_one_or_zero": 0.01611328125,
|
|
"rewards/format_reward": 0.9990234375,
|
|
"rewards/mean_confidence_reward": 0.4710400402545929,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.9984,
|
|
"step": 78,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.006661382568404989,
|
|
"train_runtime": 48105.2196,
|
|
"train_samples_per_second": 0.416,
|
|
"train_steps_per_second": 0.002
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 78,
|
|
"num_input_tokens_seen": 261189064,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 30,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|