Files
qwen2.5-7B-rlcr_g8_b512/trainer_state.json

2264 lines
83 KiB
JSON
Raw Normal View History

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 15,
"global_step": 78,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0595703125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 932.0,
"completions/mean_length": 252.85693359375,
"completions/mean_terminated_length": 204.00987243652344,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0128,
"grad_norm": 0.01206118892878294,
"learning_rate": 2.5e-07,
"loss": 0.062,
"num_tokens": 3474099.0,
"reward": 0.646336019039154,
"reward_std": 0.503198504447937,
"rewards/accuracy_reward": 0.24609375,
"rewards/brier_reward": 0.37909579277038574,
"rewards/confidence_one_or_zero": 0.26953125,
"rewards/format_reward": 0.66748046875,
"rewards/mean_confidence_reward": 0.7516889572143555,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0556640625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 995.0,
"completions/mean_length": 257.6103515625,
"completions/mean_terminated_length": 212.43536376953125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0256,
"grad_norm": 0.05630214512348175,
"learning_rate": 5e-07,
"loss": 0.06,
"num_tokens": 7050869.0,
"reward": 0.5985734462738037,
"reward_std": 0.4480513036251068,
"rewards/accuracy_reward": 0.171875,
"rewards/brier_reward": 0.3416762948036194,
"rewards/confidence_one_or_zero": 0.25537109375,
"rewards/format_reward": 0.68359375,
"rewards/mean_confidence_reward": 0.7263393402099609,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.052734375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1021.0,
"completions/mean_length": 254.6923828125,
"completions/mean_terminated_length": 211.86495971679688,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0384,
"grad_norm": 0.02448289282619953,
"learning_rate": 7.5e-07,
"loss": 0.0532,
"num_tokens": 10556279.0,
"reward": 0.6241230964660645,
"reward_std": 0.45209184288978577,
"rewards/accuracy_reward": 0.203125,
"rewards/brier_reward": 0.36250197887420654,
"rewards/confidence_one_or_zero": 0.26611328125,
"rewards/format_reward": 0.6826171875,
"rewards/mean_confidence_reward": 0.7384810447692871,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05224609375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1021.0,
"completions/mean_length": 255.78857421875,
"completions/mean_terminated_length": 213.43997192382812,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0512,
"grad_norm": 0.009398565627634525,
"learning_rate": 1e-06,
"loss": 0.0588,
"num_tokens": 14071174.0,
"reward": 0.625983715057373,
"reward_std": 0.46379736065864563,
"rewards/accuracy_reward": 0.1982421875,
"rewards/brier_reward": 0.3657350540161133,
"rewards/confidence_one_or_zero": 0.271484375,
"rewards/format_reward": 0.68798828125,
"rewards/mean_confidence_reward": 0.7340266704559326,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04931640625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1012.0,
"completions/mean_length": 250.24267578125,
"completions/mean_terminated_length": 210.104248046875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.064,
"grad_norm": 0.010432912968099117,
"learning_rate": 1e-06,
"loss": 0.0609,
"num_tokens": 17581279.0,
"reward": 0.6377379894256592,
"reward_std": 0.4767524302005768,
"rewards/accuracy_reward": 0.2119140625,
"rewards/brier_reward": 0.37361857295036316,
"rewards/confidence_one_or_zero": 0.2607421875,
"rewards/format_reward": 0.68994140625,
"rewards/mean_confidence_reward": 0.72353196144104,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0419921875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 983.0,
"completions/mean_length": 233.49072265625,
"completions/mean_terminated_length": 198.8404541015625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0768,
"grad_norm": 0.03446255251765251,
"learning_rate": 1e-06,
"loss": 0.0446,
"num_tokens": 21031396.0,
"reward": 0.7123662233352661,
"reward_std": 0.4492800533771515,
"rewards/accuracy_reward": 0.2568359375,
"rewards/brier_reward": 0.41545307636260986,
"rewards/confidence_one_or_zero": 0.2783203125,
"rewards/format_reward": 0.75244140625,
"rewards/mean_confidence_reward": 0.7652001976966858,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0322265625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1020.0,
"completions/mean_length": 218.8916015625,
"completions/mean_terminated_length": 192.08172607421875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0896,
"grad_norm": 0.03061598353087902,
"learning_rate": 1e-06,
"loss": 0.043,
"num_tokens": 24512238.0,
"reward": 0.7533546686172485,
"reward_std": 0.40088847279548645,
"rewards/accuracy_reward": 0.25,
"rewards/brier_reward": 0.43981271982192993,
"rewards/confidence_one_or_zero": 0.259765625,
"rewards/format_reward": 0.81689453125,
"rewards/mean_confidence_reward": 0.7687591314315796,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1009.0,
"completions/mean_length": 187.7744140625,
"completions/mean_terminated_length": 172.81211853027344,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1024,
"grad_norm": 0.014748022891581059,
"learning_rate": 1e-06,
"loss": 0.0338,
"num_tokens": 27904744.0,
"reward": 0.8088976144790649,
"reward_std": 0.3649292290210724,
"rewards/accuracy_reward": 0.26953125,
"rewards/brier_reward": 0.46837902069091797,
"rewards/confidence_one_or_zero": 0.2646484375,
"rewards/format_reward": 0.8798828125,
"rewards/mean_confidence_reward": 0.7798730134963989,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 945.0,
"completions/mean_length": 175.26171875,
"completions/mean_terminated_length": 161.78968811035156,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.1152,
"grad_norm": 0.1248084306716919,
"learning_rate": 1e-06,
"loss": 0.0267,
"num_tokens": 31256936.0,
"reward": 0.8849074840545654,
"reward_std": 0.3317580819129944,
"rewards/accuracy_reward": 0.32470703125,
"rewards/brier_reward": 0.5227425694465637,
"rewards/confidence_one_or_zero": 0.2705078125,
"rewards/format_reward": 0.92236328125,
"rewards/mean_confidence_reward": 0.7816927433013916,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 630.0,
"completions/mean_length": 162.44384765625,
"completions/mean_terminated_length": 148.7683563232422,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.128,
"grad_norm": 0.013320323079824448,
"learning_rate": 1e-06,
"loss": 0.022,
"num_tokens": 34584917.0,
"reward": 0.8880295753479004,
"reward_std": 0.3083075284957886,
"rewards/accuracy_reward": 0.30712890625,
"rewards/brier_reward": 0.5314282178878784,
"rewards/confidence_one_or_zero": 0.2568359375,
"rewards/format_reward": 0.9375,
"rewards/mean_confidence_reward": 0.7687011957168579,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00927734375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 541.0,
"completions/mean_length": 149.92822265625,
"completions/mean_terminated_length": 141.74322509765625,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1408,
"grad_norm": 0.009336220100522041,
"learning_rate": 1e-06,
"loss": 0.0137,
"num_tokens": 37879610.0,
"reward": 0.9682935476303101,
"reward_std": 0.30119815468788147,
"rewards/accuracy_reward": 0.39697265625,
"rewards/brier_reward": 0.5859990119934082,
"rewards/confidence_one_or_zero": 0.27587890625,
"rewards/format_reward": 0.95361328125,
"rewards/mean_confidence_reward": 0.7835807204246521,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1000.0,
"completions/mean_length": 148.54931640625,
"completions/mean_terminated_length": 143.38949584960938,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1536,
"grad_norm": 0.01234795618802309,
"learning_rate": 1e-06,
"loss": 0.0109,
"num_tokens": 41182895.0,
"reward": 0.9633985757827759,
"reward_std": 0.27885445952415466,
"rewards/accuracy_reward": 0.36572265625,
"rewards/brier_reward": 0.588416337966919,
"rewards/confidence_one_or_zero": 0.21923828125,
"rewards/format_reward": 0.97265625,
"rewards/mean_confidence_reward": 0.7653173804283142,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0029296875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 804.0,
"completions/mean_length": 134.34326171875,
"completions/mean_terminated_length": 131.72918701171875,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.1664,
"grad_norm": 0.011425751261413097,
"learning_rate": 1e-06,
"loss": 0.0041,
"num_tokens": 44485246.0,
"reward": 0.9407795071601868,
"reward_std": 0.2531838119029999,
"rewards/accuracy_reward": 0.32666015625,
"rewards/brier_reward": 0.5758930444717407,
"rewards/confidence_one_or_zero": 0.20947265625,
"rewards/format_reward": 0.97900390625,
"rewards/mean_confidence_reward": 0.75456702709198,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0029296875,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 1018.0,
"completions/mean_length": 138.37548828125,
"completions/mean_terminated_length": 135.77325439453125,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1792,
"grad_norm": 0.004313925746828318,
"learning_rate": 1e-06,
"loss": 0.0035,
"num_tokens": 47816527.0,
"reward": 0.9434206485748291,
"reward_std": 0.24209100008010864,
"rewards/accuracy_reward": 0.31884765625,
"rewards/brier_reward": 0.5865465402603149,
"rewards/confidence_one_or_zero": 0.15283203125,
"rewards/format_reward": 0.9814453125,
"rewards/mean_confidence_reward": 0.7383813858032227,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00439453125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 133.2216796875,
"completions/mean_terminated_length": 129.28985595703125,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.192,
"grad_norm": 0.00784427672624588,
"learning_rate": 1e-06,
"loss": 0.006,
"num_tokens": 51033389.0,
"reward": 0.983768105506897,
"reward_std": 0.24871209263801575,
"rewards/accuracy_reward": 0.361328125,
"rewards/brier_reward": 0.623296320438385,
"rewards/confidence_one_or_zero": 0.1484375,
"rewards/format_reward": 0.98291015625,
"rewards/mean_confidence_reward": 0.7346706986427307,
"step": 15
},
{
"epoch": 0.192,
"eval_completions/clipped_ratio": 0.00390625,
"eval_completions/max_length": 675.0,
"eval_completions/max_terminated_length": 347.5,
"eval_completions/mean_length": 134.63638305664062,
"eval_completions/mean_terminated_length": 131.14013671875,
"eval_completions/min_length": 56.0,
"eval_completions/min_terminated_length": 56.0,
"eval_loss": 0.0,
"eval_num_tokens": 51033389.0,
"eval_reward": 0.9371011257171631,
"eval_reward_std": 0.3611362501978874,
"eval_rewards/accuracy_reward": 0.30078125,
"eval_rewards/brier_reward": 0.5890443176031113,
"eval_rewards/confidence_one_or_zero": 0.12109375,
"eval_rewards/format_reward": 0.984375,
"eval_rewards/mean_confidence_reward": 0.712636724114418,
"eval_runtime": 50.8118,
"eval_samples_per_second": 9.84,
"eval_steps_per_second": 0.079,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00244140625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 865.0,
"completions/mean_length": 136.1689453125,
"completions/mean_terminated_length": 133.99607849121094,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.2048,
"grad_norm": 0.015799518674612045,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 54328887.0,
"reward": 0.9970545768737793,
"reward_std": 0.23557817935943604,
"rewards/accuracy_reward": 0.38330078125,
"rewards/brier_reward": 0.6230137348175049,
"rewards/confidence_one_or_zero": 0.13623046875,
"rewards/format_reward": 0.98779296875,
"rewards/mean_confidence_reward": 0.7369694113731384,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00244140625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 586.0,
"completions/mean_length": 130.96484375,
"completions/mean_terminated_length": 128.7792510986328,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.2176,
"grad_norm": 0.011801626533269882,
"learning_rate": 1e-06,
"loss": 0.0024,
"num_tokens": 57602287.0,
"reward": 0.9894813299179077,
"reward_std": 0.22355304658412933,
"rewards/accuracy_reward": 0.35205078125,
"rewards/brier_reward": 0.6386289596557617,
"rewards/confidence_one_or_zero": 0.13720703125,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.7070361375808716,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00244140625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 684.0,
"completions/mean_length": 133.64404296875,
"completions/mean_terminated_length": 131.46499633789062,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.2304,
"grad_norm": 0.003088222583755851,
"learning_rate": 1e-06,
"loss": 0.0024,
"num_tokens": 60943718.0,
"reward": 1.0247716903686523,
"reward_std": 0.2183222770690918,
"rewards/accuracy_reward": 0.39892578125,
"rewards/brier_reward": 0.6613582372665405,
"rewards/confidence_one_or_zero": 0.11376953125,
"rewards/format_reward": 0.9892578125,
"rewards/mean_confidence_reward": 0.7037431001663208,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001953125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 684.0,
"completions/mean_length": 130.42626953125,
"completions/mean_terminated_length": 128.67759704589844,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.2432,
"grad_norm": 0.0030834106728434563,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 64218495.0,
"reward": 1.0376394987106323,
"reward_std": 0.22437486052513123,
"rewards/accuracy_reward": 0.40869140625,
"rewards/brier_reward": 0.6768399477005005,
"rewards/confidence_one_or_zero": 0.10986328125,
"rewards/format_reward": 0.98974609375,
"rewards/mean_confidence_reward": 0.6858630180358887,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 605.0,
"completions/mean_length": 129.1884765625,
"completions/mean_terminated_length": 128.31378173828125,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"epoch": 0.256,
"grad_norm": 0.0019296990940347314,
"learning_rate": 1e-06,
"loss": -0.0007,
"num_tokens": 67508849.0,
"reward": 1.0304187536239624,
"reward_std": 0.20604413747787476,
"rewards/accuracy_reward": 0.39013671875,
"rewards/brier_reward": 0.6755821108818054,
"rewards/confidence_one_or_zero": 0.0966796875,
"rewards/format_reward": 0.9951171875,
"rewards/mean_confidence_reward": 0.6770117282867432,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 919.0,
"completions/mean_length": 134.54052734375,
"completions/mean_terminated_length": 134.1060028076172,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.2688,
"grad_norm": 0.0030771668534725904,
"learning_rate": 1e-06,
"loss": -0.0013,
"num_tokens": 70787884.0,
"reward": 1.0380725860595703,
"reward_std": 0.197679340839386,
"rewards/accuracy_reward": 0.39208984375,
"rewards/brier_reward": 0.6879599690437317,
"rewards/confidence_one_or_zero": 0.0791015625,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.6546210646629333,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00146484375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 766.0,
"completions/mean_length": 131.81640625,
"completions/mean_terminated_length": 130.50758361816406,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"epoch": 0.2816,
"grad_norm": 0.001605757512152195,
"learning_rate": 1e-06,
"loss": 0.0016,
"num_tokens": 74049700.0,
"reward": 1.0163131952285767,
"reward_std": 0.19691747426986694,
"rewards/accuracy_reward": 0.35302734375,
"rewards/brier_reward": 0.6854572296142578,
"rewards/confidence_one_or_zero": 0.076171875,
"rewards/format_reward": 0.994140625,
"rewards/mean_confidence_reward": 0.6287341713905334,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 470.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 130.5732421875,
"completions/mean_terminated_length": 130.5732421875,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.2944,
"grad_norm": 0.0016451003029942513,
"learning_rate": 1e-06,
"loss": -0.0007,
"num_tokens": 77286914.0,
"reward": 1.052234411239624,
"reward_std": 0.19646784663200378,
"rewards/accuracy_reward": 0.39404296875,
"rewards/brier_reward": 0.7128661274909973,
"rewards/confidence_one_or_zero": 0.06005859375,
"rewards/format_reward": 0.99755859375,
"rewards/mean_confidence_reward": 0.6058691143989563,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 552.0,
"completions/mean_length": 135.2314453125,
"completions/mean_terminated_length": 134.36265563964844,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.3072,
"grad_norm": 0.0020764193031936884,
"learning_rate": 1e-06,
"loss": 0.0021,
"num_tokens": 80575708.0,
"reward": 1.030700922012329,
"reward_std": 0.17984464764595032,
"rewards/accuracy_reward": 0.357421875,
"rewards/brier_reward": 0.7069083452224731,
"rewards/confidence_one_or_zero": 0.05517578125,
"rewards/format_reward": 0.9970703125,
"rewards/mean_confidence_reward": 0.5960919857025146,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 811.0,
"completions/mean_length": 135.84375,
"completions/mean_terminated_length": 134.97555541992188,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.32,
"grad_norm": 0.0012462260201573372,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 83862556.0,
"reward": 1.0884177684783936,
"reward_std": 0.16984757781028748,
"rewards/accuracy_reward": 0.43359375,
"rewards/brier_reward": 0.7461701035499573,
"rewards/confidence_one_or_zero": 0.060546875,
"rewards/format_reward": 0.9970703125,
"rewards/mean_confidence_reward": 0.5713028907775879,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00146484375,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 138.23193359375,
"completions/mean_terminated_length": 136.93251037597656,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.3328,
"grad_norm": 0.0014977873070165515,
"learning_rate": 1e-06,
"loss": 0.0025,
"num_tokens": 87091063.0,
"reward": 1.076505184173584,
"reward_std": 0.16160905361175537,
"rewards/accuracy_reward": 0.4140625,
"rewards/brier_reward": 0.741388201713562,
"rewards/confidence_one_or_zero": 0.0517578125,
"rewards/format_reward": 0.99755859375,
"rewards/mean_confidence_reward": 0.5425801277160645,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 836.0,
"completions/mean_length": 137.69921875,
"completions/mean_terminated_length": 136.8328399658203,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.3456,
"grad_norm": 0.0013432031264528632,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 90390975.0,
"reward": 1.0520920753479004,
"reward_std": 0.1620258092880249,
"rewards/accuracy_reward": 0.36328125,
"rewards/brier_reward": 0.7428549528121948,
"rewards/confidence_one_or_zero": 0.048828125,
"rewards/format_reward": 0.998046875,
"rewards/mean_confidence_reward": 0.5067415237426758,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 522.0,
"completions/mean_length": 139.10302734375,
"completions/mean_terminated_length": 138.6707305908203,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.3584,
"grad_norm": 0.001807131338864565,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 93687954.0,
"reward": 1.068045735359192,
"reward_std": 0.14087209105491638,
"rewards/accuracy_reward": 0.3828125,
"rewards/brier_reward": 0.7547427415847778,
"rewards/confidence_one_or_zero": 0.04541015625,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.4919547438621521,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 370.0,
"completions/mean_length": 144.34716796875,
"completions/mean_terminated_length": 143.9174346923828,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.3712,
"grad_norm": 0.0023208060301840305,
"learning_rate": 1e-06,
"loss": 0.0012,
"num_tokens": 97022633.0,
"reward": 1.065900206565857,
"reward_std": 0.13991808891296387,
"rewards/accuracy_reward": 0.37255859375,
"rewards/brier_reward": 0.7607055902481079,
"rewards/confidence_one_or_zero": 0.05712890625,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.4605600833892822,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 589.0,
"completions/max_terminated_length": 589.0,
"completions/mean_length": 143.078125,
"completions/mean_terminated_length": 143.078125,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.384,
"grad_norm": 0.0033083283342421055,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 100277401.0,
"reward": 1.080049753189087,
"reward_std": 0.13700106739997864,
"rewards/accuracy_reward": 0.396484375,
"rewards/brier_reward": 0.7650789618492126,
"rewards/confidence_one_or_zero": 0.0498046875,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.4552270770072937,
"step": 30
},
{
"epoch": 0.384,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 353.25,
"eval_completions/max_terminated_length": 353.25,
"eval_completions/mean_length": 144.8663787841797,
"eval_completions/mean_terminated_length": 144.8663787841797,
"eval_completions/min_length": 73.75,
"eval_completions/min_terminated_length": 73.75,
"eval_loss": 0.0,
"eval_num_tokens": 100277401.0,
"eval_reward": 1.0514324307441711,
"eval_reward_std": 0.226553276181221,
"eval_rewards/accuracy_reward": 0.330078125,
"eval_rewards/brier_reward": 0.7727857530117035,
"eval_rewards/confidence_one_or_zero": 0.0546875,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.4388085976243019,
"eval_runtime": 36.7184,
"eval_samples_per_second": 13.617,
"eval_steps_per_second": 0.109,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 397.0,
"completions/mean_length": 143.3134765625,
"completions/mean_terminated_length": 142.88323974609375,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.3968,
"grad_norm": 0.0009655402973294258,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 103574067.0,
"reward": 1.0772579908370972,
"reward_std": 0.13317140936851501,
"rewards/accuracy_reward": 0.3837890625,
"rewards/brier_reward": 0.7721908092498779,
"rewards/confidence_one_or_zero": 0.05224609375,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.4339550733566284,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 523.0,
"completions/mean_length": 147.052734375,
"completions/mean_terminated_length": 146.62432861328125,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.4096,
"grad_norm": 0.0014063868438825011,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 106852399.0,
"reward": 1.0804073810577393,
"reward_std": 0.12738269567489624,
"rewards/accuracy_reward": 0.39697265625,
"rewards/brier_reward": 0.7653061151504517,
"rewards/confidence_one_or_zero": 0.04296875,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.42521679401397705,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 426.0,
"completions/mean_length": 144.61669921875,
"completions/mean_terminated_length": 144.18710327148438,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.4224,
"grad_norm": 0.0014064498245716095,
"learning_rate": 1e-06,
"loss": -0.0,
"num_tokens": 110157062.0,
"reward": 1.0807609558105469,
"reward_std": 0.1330416202545166,
"rewards/accuracy_reward": 0.39404296875,
"rewards/brier_reward": 0.7699194550514221,
"rewards/confidence_one_or_zero": 0.048828125,
"rewards/format_reward": 0.99755859375,
"rewards/mean_confidence_reward": 0.4001578688621521,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 361.0,
"completions/max_terminated_length": 361.0,
"completions/mean_length": 150.60107421875,
"completions/mean_terminated_length": 150.60107421875,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.4352,
"grad_norm": 0.0029456529300659895,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 113450349.0,
"reward": 1.104508876800537,
"reward_std": 0.11853398382663727,
"rewards/accuracy_reward": 0.43408203125,
"rewards/brier_reward": 0.7754232287406921,
"rewards/confidence_one_or_zero": 0.04541015625,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.4032275378704071,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 432.0,
"completions/max_terminated_length": 432.0,
"completions/mean_length": 151.95751953125,
"completions/mean_terminated_length": 151.95751953125,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"epoch": 0.448,
"grad_norm": 0.0026214425452053547,
"learning_rate": 1e-06,
"loss": -0.0004,
"num_tokens": 116759078.0,
"reward": 1.0844058990478516,
"reward_std": 0.11485770344734192,
"rewards/accuracy_reward": 0.408203125,
"rewards/brier_reward": 0.7615846395492554,
"rewards/confidence_one_or_zero": 0.044921875,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.38147109746932983,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 505.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 153.82666015625,
"completions/mean_terminated_length": 153.82666015625,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.4608,
"grad_norm": 0.0012385396985337138,
"learning_rate": 1e-06,
"loss": -0.0007,
"num_tokens": 120127755.0,
"reward": 1.0605785846710205,
"reward_std": 0.10509373247623444,
"rewards/accuracy_reward": 0.3427734375,
"rewards/brier_reward": 0.7788711190223694,
"rewards/confidence_one_or_zero": 0.04248046875,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.3768359422683716,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 590.0,
"completions/mean_length": 153.3173828125,
"completions/mean_terminated_length": 152.8920440673828,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.4736,
"grad_norm": 0.002045744564384222,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 123451997.0,
"reward": 1.0735026597976685,
"reward_std": 0.10574886202812195,
"rewards/accuracy_reward": 0.37060546875,
"rewards/brier_reward": 0.7773755788803101,
"rewards/confidence_one_or_zero": 0.0498046875,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.3696533143520355,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 633.0,
"completions/max_terminated_length": 633.0,
"completions/mean_length": 152.341796875,
"completions/mean_terminated_length": 152.341796875,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.4864,
"grad_norm": 0.003814885625615716,
"learning_rate": 1e-06,
"loss": -0.0004,
"num_tokens": 126776897.0,
"reward": 1.097827434539795,
"reward_std": 0.10771030187606812,
"rewards/accuracy_reward": 0.439453125,
"rewards/brier_reward": 0.7566891312599182,
"rewards/confidence_one_or_zero": 0.05615234375,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.34937989711761475,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 411.0,
"completions/max_terminated_length": 411.0,
"completions/mean_length": 156.4228515625,
"completions/mean_terminated_length": 156.4228515625,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.4992,
"grad_norm": 0.0009488159557804465,
"learning_rate": 1e-06,
"loss": -0.0005,
"num_tokens": 130164187.0,
"reward": 1.098426103591919,
"reward_std": 0.1080411821603775,
"rewards/accuracy_reward": 0.44775390625,
"rewards/brier_reward": 0.7495858073234558,
"rewards/confidence_one_or_zero": 0.04248046875,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.35322752594947815,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 375.0,
"completions/max_terminated_length": 375.0,
"completions/mean_length": 155.7001953125,
"completions/mean_terminated_length": 155.7001953125,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.512,
"grad_norm": 0.0009362637065351009,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 133507421.0,
"reward": 1.0903565883636475,
"reward_std": 0.10990739613771439,
"rewards/accuracy_reward": 0.41455078125,
"rewards/brier_reward": 0.7676265239715576,
"rewards/confidence_one_or_zero": 0.03759765625,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.3521386981010437,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 608.0,
"completions/max_terminated_length": 608.0,
"completions/mean_length": 155.94287109375,
"completions/mean_terminated_length": 155.94287109375,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.5248,
"grad_norm": 0.0018933570245280862,
"learning_rate": 1e-06,
"loss": -0.0003,
"num_tokens": 136822400.0,
"reward": 1.0992754697799683,
"reward_std": 0.10722452402114868,
"rewards/accuracy_reward": 0.42822265625,
"rewards/brier_reward": 0.7717922925949097,
"rewards/confidence_one_or_zero": 0.05810546875,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.35319823026657104,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 413.0,
"completions/max_terminated_length": 413.0,
"completions/mean_length": 159.2255859375,
"completions/mean_terminated_length": 159.2255859375,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.5376,
"grad_norm": 0.0014252394903451204,
"learning_rate": 1e-06,
"loss": -0.0,
"num_tokens": 140161310.0,
"reward": 1.1002483367919922,
"reward_std": 0.10786743462085724,
"rewards/accuracy_reward": 0.43994140625,
"rewards/brier_reward": 0.7615311145782471,
"rewards/confidence_one_or_zero": 0.04345703125,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.3506225645542145,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 499.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 160.8203125,
"completions/mean_terminated_length": 160.8203125,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.5504,
"grad_norm": 0.0009207865223288536,
"learning_rate": 1e-06,
"loss": -0.0002,
"num_tokens": 143509286.0,
"reward": 1.1000754833221436,
"reward_std": 0.11076341569423676,
"rewards/accuracy_reward": 0.44384765625,
"rewards/brier_reward": 0.7572791576385498,
"rewards/confidence_one_or_zero": 0.03955078125,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.35358887910842896,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 467.0,
"completions/max_terminated_length": 467.0,
"completions/mean_length": 160.43994140625,
"completions/mean_terminated_length": 160.43994140625,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.5632,
"grad_norm": 0.0006189781124703586,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 146836963.0,
"reward": 1.0809619426727295,
"reward_std": 0.09901401400566101,
"rewards/accuracy_reward": 0.38720703125,
"rewards/brier_reward": 0.7761809825897217,
"rewards/confidence_one_or_zero": 0.04248046875,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.34800535440444946,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 413.0,
"completions/max_terminated_length": 413.0,
"completions/mean_length": 159.8115234375,
"completions/mean_terminated_length": 159.8115234375,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.576,
"grad_norm": 0.02215094119310379,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 150188409.0,
"reward": 1.0951708555221558,
"reward_std": 0.10278713703155518,
"rewards/accuracy_reward": 0.4228515625,
"rewards/brier_reward": 0.7679775953292847,
"rewards/confidence_one_or_zero": 0.033203125,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.35493797063827515,
"step": 45
},
{
"epoch": 0.576,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 287.75,
"eval_completions/max_terminated_length": 287.75,
"eval_completions/mean_length": 161.08263778686523,
"eval_completions/mean_terminated_length": 161.08263778686523,
"eval_completions/min_length": 85.5,
"eval_completions/min_terminated_length": 85.5,
"eval_loss": 0.0,
"eval_num_tokens": 150188409.0,
"eval_reward": 1.0704601407051086,
"eval_reward_std": 0.18598725646734238,
"eval_rewards/accuracy_reward": 0.349609375,
"eval_rewards/brier_reward": 0.7913101464509964,
"eval_rewards/confidence_one_or_zero": 0.03125,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.36238280683755875,
"eval_runtime": 33.0054,
"eval_samples_per_second": 15.149,
"eval_steps_per_second": 0.121,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 478.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 159.986328125,
"completions/mean_terminated_length": 159.986328125,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.5888,
"grad_norm": 0.003660782240331173,
"learning_rate": 1e-06,
"loss": -0.0006,
"num_tokens": 153552477.0,
"reward": 1.095861792564392,
"reward_std": 0.0982513576745987,
"rewards/accuracy_reward": 0.4150390625,
"rewards/brier_reward": 0.7776603102684021,
"rewards/confidence_one_or_zero": 0.048828125,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.36439940333366394,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 443.0,
"completions/max_terminated_length": 443.0,
"completions/mean_length": 161.75146484375,
"completions/mean_terminated_length": 161.75146484375,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.6016,
"grad_norm": 0.004005622584372759,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 156878152.0,
"reward": 1.1043840646743774,
"reward_std": 0.10187876969575882,
"rewards/accuracy_reward": 0.43408203125,
"rewards/brier_reward": 0.7761501669883728,
"rewards/confidence_one_or_zero": 0.03759765625,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.3677002191543579,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 361.0,
"completions/max_terminated_length": 361.0,
"completions/mean_length": 163.7666015625,
"completions/mean_terminated_length": 163.7666015625,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.6144,
"grad_norm": 0.0013804073678329587,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 160274850.0,
"reward": 1.1110379695892334,
"reward_std": 0.09990298002958298,
"rewards/accuracy_reward": 0.45361328125,
"rewards/brier_reward": 0.7684618234634399,
"rewards/confidence_one_or_zero": 0.0341796875,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.3786962926387787,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 571.0,
"completions/max_terminated_length": 571.0,
"completions/mean_length": 165.44677734375,
"completions/mean_terminated_length": 165.44677734375,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.6272,
"grad_norm": 0.0006175300804898143,
"learning_rate": 1e-06,
"loss": -0.0005,
"num_tokens": 163673933.0,
"reward": 1.1008734703063965,
"reward_std": 0.1041068434715271,
"rewards/accuracy_reward": 0.43603515625,
"rewards/brier_reward": 0.7657108902931213,
"rewards/confidence_one_or_zero": 0.0205078125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.38051414489746094,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 553.0,
"completions/max_terminated_length": 553.0,
"completions/mean_length": 166.4296875,
"completions/mean_terminated_length": 166.4296875,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.64,
"grad_norm": 0.0007595557253807783,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 167075853.0,
"reward": 1.119231104850769,
"reward_std": 0.10055085271596909,
"rewards/accuracy_reward": 0.4638671875,
"rewards/brier_reward": 0.7750825881958008,
"rewards/confidence_one_or_zero": 0.02685546875,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.40596190094947815,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 452.0,
"completions/max_terminated_length": 452.0,
"completions/mean_length": 166.87060546875,
"completions/mean_terminated_length": 166.87060546875,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.6528,
"grad_norm": 0.0037527712993323803,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 170378204.0,
"reward": 1.0836341381072998,
"reward_std": 0.10692138969898224,
"rewards/accuracy_reward": 0.39794921875,
"rewards/brier_reward": 0.7717597484588623,
"rewards/confidence_one_or_zero": 0.02587890625,
"rewards/format_reward": 0.99755859375,
"rewards/mean_confidence_reward": 0.4016748070716858,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 527.0,
"completions/max_terminated_length": 527.0,
"completions/mean_length": 165.9951171875,
"completions/mean_terminated_length": 165.9951171875,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.6656,
"grad_norm": 0.0017999252304434776,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 173709242.0,
"reward": 1.1069138050079346,
"reward_std": 0.10350505262613297,
"rewards/accuracy_reward": 0.4482421875,
"rewards/brier_reward": 0.7655847072601318,
"rewards/confidence_one_or_zero": 0.02392578125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.4061328172683716,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 167.8916015625,
"completions/mean_terminated_length": 167.8916015625,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.6784,
"grad_norm": 0.002350540366023779,
"learning_rate": 1e-06,
"loss": -0.0,
"num_tokens": 177022084.0,
"reward": 1.08831787109375,
"reward_std": 0.10483807325363159,
"rewards/accuracy_reward": 0.39990234375,
"rewards/brier_reward": 0.7767325639724731,
"rewards/confidence_one_or_zero": 0.02294921875,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.4102539122104645,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 601.0,
"completions/max_terminated_length": 601.0,
"completions/mean_length": 170.22705078125,
"completions/mean_terminated_length": 170.22705078125,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.6912,
"grad_norm": 0.0024247784167528152,
"learning_rate": 1e-06,
"loss": -0.0002,
"num_tokens": 180378381.0,
"reward": 1.0937700271606445,
"reward_std": 0.11120226234197617,
"rewards/accuracy_reward": 0.41552734375,
"rewards/brier_reward": 0.7725003957748413,
"rewards/confidence_one_or_zero": 0.01171875,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.41514649987220764,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 481.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 171.05126953125,
"completions/mean_terminated_length": 171.05126953125,
"completions/min_length": 82.0,
"completions/min_terminated_length": 82.0,
"epoch": 0.704,
"grad_norm": 0.0025645701680332422,
"learning_rate": 1e-06,
"loss": -0.0002,
"num_tokens": 183697854.0,
"reward": 1.1110807657241821,
"reward_std": 0.11133909225463867,
"rewards/accuracy_reward": 0.45947265625,
"rewards/brier_reward": 0.7631762027740479,
"rewards/confidence_one_or_zero": 0.0224609375,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.42967772483825684,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 569.0,
"completions/max_terminated_length": 569.0,
"completions/mean_length": 171.7705078125,
"completions/mean_terminated_length": 171.7705078125,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.7168,
"grad_norm": 0.0012854809174314141,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 187083480.0,
"reward": 1.1149908304214478,
"reward_std": 0.11459929496049881,
"rewards/accuracy_reward": 0.4580078125,
"rewards/brier_reward": 0.7744144797325134,
"rewards/confidence_one_or_zero": 0.0146484375,
"rewards/format_reward": 0.99755859375,
"rewards/mean_confidence_reward": 0.4292285144329071,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 503.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 171.90771484375,
"completions/mean_terminated_length": 171.90771484375,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.7296,
"grad_norm": 0.0018713475437834859,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 190385139.0,
"reward": 1.1170523166656494,
"reward_std": 0.11444520950317383,
"rewards/accuracy_reward": 0.4619140625,
"rewards/brier_reward": 0.772189736366272,
"rewards/confidence_one_or_zero": 0.02001953125,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.4356445372104645,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 170.96044921875,
"completions/mean_terminated_length": 170.96044921875,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.7424,
"grad_norm": 0.0026220239233225584,
"learning_rate": 1e-06,
"loss": -0.0008,
"num_tokens": 193783130.0,
"reward": 1.1173450946807861,
"reward_std": 0.11841192841529846,
"rewards/accuracy_reward": 0.46337890625,
"rewards/brier_reward": 0.7713105082511902,
"rewards/confidence_one_or_zero": 0.01416015625,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.4385010004043579,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 364.0,
"completions/max_terminated_length": 364.0,
"completions/mean_length": 173.064453125,
"completions/mean_terminated_length": 173.064453125,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.7552,
"grad_norm": 0.0012924442999064922,
"learning_rate": 1e-06,
"loss": 0.0012,
"num_tokens": 197140494.0,
"reward": 1.105983018875122,
"reward_std": 0.107911616563797,
"rewards/accuracy_reward": 0.43505859375,
"rewards/brier_reward": 0.7773948907852173,
"rewards/confidence_one_or_zero": 0.015625,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.44759440422058105,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 757.0,
"completions/mean_length": 176.484375,
"completions/mean_terminated_length": 176.07034301757812,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.768,
"grad_norm": 0.0028388705104589462,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 200495046.0,
"reward": 1.110207438468933,
"reward_std": 0.11490217596292496,
"rewards/accuracy_reward": 0.44677734375,
"rewards/brier_reward": 0.7746132612228394,
"rewards/confidence_one_or_zero": 0.01171875,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.45084962248802185,
"step": 60
},
{
"epoch": 0.768,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 340.5,
"eval_completions/max_terminated_length": 340.5,
"eval_completions/mean_length": 177.17342376708984,
"eval_completions/mean_terminated_length": 177.17342376708984,
"eval_completions/min_length": 83.5,
"eval_completions/min_terminated_length": 83.5,
"eval_loss": 0.0,
"eval_num_tokens": 200495046.0,
"eval_reward": 1.060145229101181,
"eval_reward_std": 0.23021632805466652,
"eval_rewards/accuracy_reward": 0.3515625,
"eval_rewards/brier_reward": 0.7706802636384964,
"eval_rewards/confidence_one_or_zero": 0.013671875,
"eval_rewards/format_reward": 0.998046875,
"eval_rewards/mean_confidence_reward": 0.44810547679662704,
"eval_runtime": 35.5968,
"eval_samples_per_second": 14.046,
"eval_steps_per_second": 0.112,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 473.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 175.306640625,
"completions/mean_terminated_length": 175.306640625,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.7808,
"grad_norm": 0.0019142339006066322,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 203882754.0,
"reward": 1.1105574369430542,
"reward_std": 0.1143236756324768,
"rewards/accuracy_reward": 0.44970703125,
"rewards/brier_reward": 0.771406888961792,
"rewards/confidence_one_or_zero": 0.01416015625,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.44986817240715027,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 479.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 175.62744140625,
"completions/mean_terminated_length": 175.62744140625,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.7936,
"grad_norm": 0.0028708542231470346,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 207246823.0,
"reward": 1.115567922592163,
"reward_std": 0.11952622979879379,
"rewards/accuracy_reward": 0.4619140625,
"rewards/brier_reward": 0.770685613155365,
"rewards/confidence_one_or_zero": 0.01708984375,
"rewards/format_reward": 0.99853515625,
"rewards/mean_confidence_reward": 0.4633447527885437,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 388.0,
"completions/max_terminated_length": 388.0,
"completions/mean_length": 171.20751953125,
"completions/mean_terminated_length": 171.20751953125,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.8064,
"grad_norm": 0.013748224824666977,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 210614152.0,
"reward": 1.1377573013305664,
"reward_std": 0.10060383379459381,
"rewards/accuracy_reward": 0.50634765625,
"rewards/brier_reward": 0.769166111946106,
"rewards/confidence_one_or_zero": 0.01708984375,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.47361326217651367,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 546.0,
"completions/max_terminated_length": 546.0,
"completions/mean_length": 173.845703125,
"completions/mean_terminated_length": 173.845703125,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"epoch": 0.8192,
"grad_norm": 0.0016836629947647452,
"learning_rate": 1e-06,
"loss": 0.0014,
"num_tokens": 213986532.0,
"reward": 1.1204241514205933,
"reward_std": 0.12249487638473511,
"rewards/accuracy_reward": 0.466796875,
"rewards/brier_reward": 0.7740505933761597,
"rewards/confidence_one_or_zero": 0.01806640625,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.4648388624191284,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 401.0,
"completions/max_terminated_length": 401.0,
"completions/mean_length": 175.62158203125,
"completions/mean_terminated_length": 175.62158203125,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"epoch": 0.832,
"grad_norm": 0.008333772420883179,
"learning_rate": 1e-06,
"loss": -0.0006,
"num_tokens": 217353213.0,
"reward": 1.1017959117889404,
"reward_std": 0.11205719411373138,
"rewards/accuracy_reward": 0.43310546875,
"rewards/brier_reward": 0.7714620232582092,
"rewards/confidence_one_or_zero": 0.0166015625,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.46193361282348633,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 442.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 174.36376953125,
"completions/mean_terminated_length": 174.36376953125,
"completions/min_length": 79.0,
"completions/min_terminated_length": 79.0,
"epoch": 0.8448,
"grad_norm": 0.0011087502352893353,
"learning_rate": 1e-06,
"loss": -0.0003,
"num_tokens": 220702790.0,
"reward": 1.1077888011932373,
"reward_std": 0.11732746660709381,
"rewards/accuracy_reward": 0.443359375,
"rewards/brier_reward": 0.773193895816803,
"rewards/confidence_one_or_zero": 0.0146484375,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.4771631062030792,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 563.0,
"completions/max_terminated_length": 563.0,
"completions/mean_length": 180.34814453125,
"completions/mean_terminated_length": 180.34814453125,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.8576,
"grad_norm": 0.0014937082305550575,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 224068031.0,
"reward": 1.111690640449524,
"reward_std": 0.10702188313007355,
"rewards/accuracy_reward": 0.44921875,
"rewards/brier_reward": 0.774161696434021,
"rewards/confidence_one_or_zero": 0.0107421875,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.47103172540664673,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 496.0,
"completions/max_terminated_length": 496.0,
"completions/mean_length": 179.4482421875,
"completions/mean_terminated_length": 179.4482421875,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.8704,
"grad_norm": 0.003253382397815585,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 227500213.0,
"reward": 1.1289840936660767,
"reward_std": 0.11423557996749878,
"rewards/accuracy_reward": 0.4765625,
"rewards/brier_reward": 0.7823812365531921,
"rewards/confidence_one_or_zero": 0.01513671875,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.4756738245487213,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 506.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 177.3544921875,
"completions/mean_terminated_length": 177.3544921875,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.8832,
"grad_norm": 0.000763367279432714,
"learning_rate": 1e-06,
"loss": -0.0004,
"num_tokens": 230841907.0,
"reward": 1.0981295108795166,
"reward_std": 0.11358843743801117,
"rewards/accuracy_reward": 0.41943359375,
"rewards/brier_reward": 0.7768245935440063,
"rewards/confidence_one_or_zero": 0.015625,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.4759277403354645,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 411.0,
"completions/mean_length": 178.068359375,
"completions/mean_terminated_length": 177.6551055908203,
"completions/min_length": 96.0,
"completions/min_terminated_length": 96.0,
"epoch": 0.896,
"grad_norm": 0.000891282397788018,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 234239855.0,
"reward": 1.1204962730407715,
"reward_std": 0.11121779680252075,
"rewards/accuracy_reward": 0.4697265625,
"rewards/brier_reward": 0.7717532515525818,
"rewards/confidence_one_or_zero": 0.01318359375,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.48045408725738525,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 453.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 181.03125,
"completions/mean_terminated_length": 181.03125,
"completions/min_length": 82.0,
"completions/min_terminated_length": 82.0,
"epoch": 0.9088,
"grad_norm": 0.0010501582873985171,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 237612871.0,
"reward": 1.1125749349594116,
"reward_std": 0.11690981686115265,
"rewards/accuracy_reward": 0.45654296875,
"rewards/brier_reward": 0.7695823907852173,
"rewards/confidence_one_or_zero": 0.01025390625,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.47194337844848633,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 564.0,
"completions/max_terminated_length": 564.0,
"completions/mean_length": 179.0400390625,
"completions/mean_terminated_length": 179.0400390625,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.9216,
"grad_norm": 0.0008443333790637553,
"learning_rate": 1e-06,
"loss": -0.0004,
"num_tokens": 241019905.0,
"reward": 1.1193747520446777,
"reward_std": 0.11691069602966309,
"rewards/accuracy_reward": 0.470703125,
"rewards/brier_reward": 0.7690220475196838,
"rewards/confidence_one_or_zero": 0.02294921875,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.46311524510383606,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 504.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 177.8017578125,
"completions/mean_terminated_length": 177.8017578125,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.9344,
"grad_norm": 0.000890648050699383,
"learning_rate": 1e-06,
"loss": -0.0004,
"num_tokens": 244393011.0,
"reward": 1.0996851921081543,
"reward_std": 0.1123179942369461,
"rewards/accuracy_reward": 0.41845703125,
"rewards/brier_reward": 0.7809123992919922,
"rewards/confidence_one_or_zero": 0.02197265625,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.4633203148841858,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 591.0,
"completions/max_terminated_length": 591.0,
"completions/mean_length": 179.6220703125,
"completions/mean_terminated_length": 179.6220703125,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.9472,
"grad_norm": 0.000775814289227128,
"learning_rate": 1e-06,
"loss": -0.0001,
"num_tokens": 247706293.0,
"reward": 1.115156650543213,
"reward_std": 0.10829775035381317,
"rewards/accuracy_reward": 0.45654296875,
"rewards/brier_reward": 0.7737695574760437,
"rewards/confidence_one_or_zero": 0.0185546875,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.4643896520137787,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 525.0,
"completions/max_terminated_length": 525.0,
"completions/mean_length": 180.197265625,
"completions/mean_terminated_length": 180.197265625,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.96,
"grad_norm": 0.0024549230001866817,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 251076793.0,
"reward": 1.1133463382720947,
"reward_std": 0.10567133873701096,
"rewards/accuracy_reward": 0.4453125,
"rewards/brier_reward": 0.781867504119873,
"rewards/confidence_one_or_zero": 0.01123046875,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.481499046087265,
"step": 75
},
{
"epoch": 0.96,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 335.25,
"eval_completions/max_terminated_length": 335.25,
"eval_completions/mean_length": 180.3135108947754,
"eval_completions/mean_terminated_length": 180.3135108947754,
"eval_completions/min_length": 106.0,
"eval_completions/min_terminated_length": 106.0,
"eval_loss": 0.0,
"eval_num_tokens": 251076793.0,
"eval_reward": 1.0895682871341705,
"eval_reward_std": 0.24639935791492462,
"eval_rewards/accuracy_reward": 0.396484375,
"eval_rewards/brier_reward": 0.7826513648033142,
"eval_rewards/confidence_one_or_zero": 0.009765625,
"eval_rewards/format_reward": 1.0,
"eval_rewards/mean_confidence_reward": 0.46923828125,
"eval_runtime": 35.4002,
"eval_samples_per_second": 14.124,
"eval_steps_per_second": 0.113,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 534.0,
"completions/mean_length": 182.37158203125,
"completions/mean_terminated_length": 181.96043395996094,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.9728,
"grad_norm": 0.0009286075364798307,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 254430658.0,
"reward": 1.1125905513763428,
"reward_std": 0.11536484956741333,
"rewards/accuracy_reward": 0.45068359375,
"rewards/brier_reward": 0.7749849557876587,
"rewards/confidence_one_or_zero": 0.015625,
"rewards/format_reward": 0.99951171875,
"rewards/mean_confidence_reward": 0.4785693287849426,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 419.0,
"completions/max_terminated_length": 419.0,
"completions/mean_length": 181.07470703125,
"completions/mean_terminated_length": 181.07470703125,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.9856,
"grad_norm": 0.0007356080459430814,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 257823147.0,
"reward": 1.113877773284912,
"reward_std": 0.1039476990699768,
"rewards/accuracy_reward": 0.44091796875,
"rewards/brier_reward": 0.7868366837501526,
"rewards/confidence_one_or_zero": 0.02685546875,
"rewards/format_reward": 1.0,
"rewards/mean_confidence_reward": 0.4647803008556366,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0006377551020407823,
"completions/max_length": 1024.0,
"completions/max_terminated_length": 371.0,
"completions/mean_length": 179.0899200439453,
"completions/mean_terminated_length": 178.5507354736328,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.9984,
"grad_norm": 0.008926053531467915,
"learning_rate": 1e-06,
"loss": -0.0,
"num_tokens": 261189064.0,
"reward": 1.1190553903579712,
"reward_std": 0.1117812842130661,
"rewards/accuracy_reward": 0.466796875,
"rewards/brier_reward": 0.772289514541626,
"rewards/confidence_one_or_zero": 0.01611328125,
"rewards/format_reward": 0.9990234375,
"rewards/mean_confidence_reward": 0.4710400402545929,
"step": 78
},
{
"epoch": 0.9984,
"step": 78,
"total_flos": 0.0,
"train_loss": 0.006661382568404989,
"train_runtime": 48105.2196,
"train_samples_per_second": 0.416,
"train_steps_per_second": 0.002
}
],
"logging_steps": 1,
"max_steps": 78,
"num_input_tokens_seen": 261189064,
"num_train_epochs": 1,
"save_steps": 30,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}