Files
big-math-digits-v2-brier-ba…/trainer_state.json
ModelHub XC 27ea1c5523 初始化项目,由ModelHub XC社区提供模型
Model: mehuldamani/big-math-digits-v2-brier-base-tabc
Source: Original Platform
2026-06-18 23:17:23 +08:00

1262 lines
49 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.49919376007799904,
"eval_steps": 50,
"global_step": 208,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021875,
"completions/max_length": 4010.4,
"completions/max_terminated_length": 4010.4,
"completions/mean_length": 503.759375,
"completions/mean_terminated_length": 515.0101257324219,
"completions/min_length": 0.0,
"completions/min_terminated_length": 6.8,
"epoch": 0.011999850001874977,
"grad_norm": 0.016190193593502045,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.006,
"num_tokens": 8917516.0,
"reward": 0.6058741450309754,
"reward_std": 0.534743320941925,
"rewards/accuracy_reward": 0.26883680522441866,
"rewards/brier_reward": 0.3217940092086792,
"rewards/confidence_one_or_zero": 0.34730902314186096,
"rewards/format_reward": 0.6210937380790711,
"rewards/mean_confidence_reward": 0.8363143563270569,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009114583333333325,
"completions/max_length": 3499.0,
"completions/max_terminated_length": 3499.0,
"completions/mean_length": 406.26458740234375,
"completions/mean_terminated_length": 410.0914794921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 25.2,
"epoch": 0.023999700003749954,
"grad_norm": 0.005708878394216299,
"learning_rate": 4.5454545454545455e-06,
"loss": -0.0094,
"num_tokens": 16680404.0,
"reward": 0.8901966333389282,
"reward_std": 0.4035582482814789,
"rewards/accuracy_reward": 0.37309027314186094,
"rewards/brier_reward": 0.4747287094593048,
"rewards/confidence_one_or_zero": 0.2348090261220932,
"rewards/format_reward": 0.9325520753860473,
"rewards/mean_confidence_reward": 0.8743563175201416,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01293402777777779,
"completions/max_length": 3835.0,
"completions/max_terminated_length": 3835.0,
"completions/mean_length": 452.3516540527344,
"completions/mean_terminated_length": 458.4701354980469,
"completions/min_length": 0.0,
"completions/min_terminated_length": 91.4,
"epoch": 0.03599955000562493,
"grad_norm": 0.0010099423816427588,
"learning_rate": 4.898477157360406e-06,
"loss": -0.0129,
"num_tokens": 24993479.0,
"reward": 1.066845965385437,
"reward_std": 0.29507095515728,
"rewards/accuracy_reward": 0.49340277910232544,
"rewards/brier_reward": 0.6558980584144593,
"rewards/confidence_one_or_zero": 0.02864583337213844,
"rewards/format_reward": 0.9843750119209289,
"rewards/mean_confidence_reward": 0.7727026224136353,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.018142361111111116,
"completions/max_length": 3412.8,
"completions/max_terminated_length": 3412.8,
"completions/mean_length": 560.2568603515625,
"completions/mean_terminated_length": 570.698095703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.6,
"epoch": 0.04799940000749991,
"grad_norm": 0.0005838912329636514,
"learning_rate": 4.771573604060914e-06,
"loss": -0.018,
"num_tokens": 34561334.0,
"reward": 1.166806983947754,
"reward_std": 0.2173929274082184,
"rewards/accuracy_reward": 0.6097222089767456,
"rewards/brier_reward": 0.7459280967712403,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9779513955116272,
"rewards/mean_confidence_reward": 0.6061267256736755,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.016753472222222232,
"completions/max_length": 3364.8,
"completions/max_terminated_length": 3364.8,
"completions/mean_length": 599.8819458007813,
"completions/mean_terminated_length": 610.2050415039063,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.05999925000937488,
"grad_norm": 0.0008427119464613497,
"learning_rate": 4.644670050761422e-06,
"loss": -0.0155,
"num_tokens": 44596422.0,
"reward": 1.193192458152771,
"reward_std": 0.18251443803310394,
"rewards/accuracy_reward": 0.6541666746139526,
"rewards/brier_reward": 0.7504361510276795,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9817708253860473,
"rewards/mean_confidence_reward": 0.5526909828186035,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.022048611111111095,
"completions/max_length": 3541.2,
"completions/max_terminated_length": 3541.2,
"completions/mean_length": 644.44619140625,
"completions/mean_terminated_length": 659.1096435546875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 189.6,
"epoch": 0.07199910001124986,
"grad_norm": 0.0007436223677359521,
"learning_rate": 4.5177664974619295e-06,
"loss": -0.0201,
"num_tokens": 55130362.0,
"reward": 1.1905469655990601,
"reward_std": 0.19011308550834655,
"rewards/accuracy_reward": 0.6495659589767456,
"rewards/brier_reward": 0.7538252711296082,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9776909708976745,
"rewards/mean_confidence_reward": 0.5784592151641845,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025260416666666653,
"completions/max_length": 3744.8,
"completions/max_terminated_length": 3744.8,
"completions/mean_length": 655.0560791015625,
"completions/mean_terminated_length": 672.1588012695313,
"completions/min_length": 0.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.08399895001312484,
"grad_norm": 0.0005305504309944808,
"learning_rate": 4.390862944162436e-06,
"loss": -0.0234,
"num_tokens": 65754048.0,
"reward": 1.1947672843933106,
"reward_std": 0.18359957337379457,
"rewards/accuracy_reward": 0.6561631917953491,
"rewards/brier_reward": 0.7592274188995362,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.974131953716278,
"rewards/mean_confidence_reward": 0.5979774236679077,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02230902777777779,
"completions/max_length": 3770.0,
"completions/max_terminated_length": 3770.0,
"completions/mean_length": 660.4337768554688,
"completions/mean_terminated_length": 675.5411987304688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 223.4,
"epoch": 0.09599880001499982,
"grad_norm": 0.0005542829167097807,
"learning_rate": 4.263959390862945e-06,
"loss": -0.0235,
"num_tokens": 76481765.0,
"reward": 1.2054712533950807,
"reward_std": 0.1817381888628006,
"rewards/accuracy_reward": 0.6667534708976746,
"rewards/brier_reward": 0.7666590690612793,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9775173544883728,
"rewards/mean_confidence_reward": 0.6286675214767456,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01597222222222221,
"completions/max_length": 3583.8,
"completions/max_terminated_length": 3583.8,
"completions/mean_length": 703.358935546875,
"completions/mean_terminated_length": 714.7557250976563,
"completions/min_length": 0.0,
"completions/min_terminated_length": 240.8,
"epoch": 0.1079986500168748,
"grad_norm": 0.0005376818589866161,
"learning_rate": 4.137055837563453e-06,
"loss": -0.0162,
"num_tokens": 87719724.0,
"reward": 1.2115298509597778,
"reward_std": 0.18161508142948152,
"rewards/accuracy_reward": 0.6661458253860474,
"rewards/brier_reward": 0.7729592084884643,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9839409708976745,
"rewards/mean_confidence_reward": 0.6761284708976746,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.018055555555555537,
"completions/max_length": 3400.2,
"completions/max_terminated_length": 3400.2,
"completions/mean_length": 739.211376953125,
"completions/mean_terminated_length": 752.7696411132813,
"completions/min_length": 0.0,
"completions/min_terminated_length": 249.8,
"epoch": 0.11999850001874976,
"grad_norm": 0.0006366602028720081,
"learning_rate": 4.0101522842639595e-06,
"loss": -0.019,
"num_tokens": 99333039.0,
"reward": 1.2029428243637086,
"reward_std": 0.18159882426261903,
"rewards/accuracy_reward": 0.6561631917953491,
"rewards/brier_reward": 0.7679378151893616,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9817708373069763,
"rewards/mean_confidence_reward": 0.6839114427566528,
"step": 50
},
{
"epoch": 0.11999850001874976,
"eval_completions/clipped_ratio": 0.021701388888888895,
"eval_completions/max_length": 2201.0,
"eval_completions/max_terminated_length": 2201.0,
"eval_completions/mean_length": 737.701416015625,
"eval_completions/mean_terminated_length": 753.9890950520834,
"eval_completions/min_length": 65.66666666666667,
"eval_completions/min_terminated_length": 332.6666666666667,
"eval_loss": 0.0,
"eval_num_tokens": 99333039.0,
"eval_reward": 1.199620544910431,
"eval_reward_std": 0.36773162086804706,
"eval_rewards/accuracy_reward": 0.6571180721124014,
"eval_rewards/brier_reward": 0.7664149304231008,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.9756944477558136,
"eval_rewards/mean_confidence_reward": 0.6796006560325623,
"eval_runtime": 321.4866,
"eval_samples_per_second": 3.111,
"eval_steps_per_second": 0.019,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02265625,
"completions/max_length": 3476.6,
"completions/max_terminated_length": 3476.6,
"completions/mean_length": 771.7972290039063,
"completions/mean_terminated_length": 789.8291259765625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 241.8,
"epoch": 0.13199835002062474,
"grad_norm": 0.0005807679262943566,
"learning_rate": 3.883248730964467e-06,
"loss": -0.0199,
"num_tokens": 111304719.0,
"reward": 1.207960844039917,
"reward_std": 0.18596434891223906,
"rewards/accuracy_reward": 0.6674479126930237,
"rewards/brier_reward": 0.7711163401603699,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.97734375,
"rewards/mean_confidence_reward": 0.6762413263320923,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02265625,
"completions/max_length": 3830.4,
"completions/max_terminated_length": 3830.4,
"completions/mean_length": 789.2795166015625,
"completions/mean_terminated_length": 807.6457641601562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 274.8,
"epoch": 0.14399820002249972,
"grad_norm": 0.0004933876334689558,
"learning_rate": 3.756345177664975e-06,
"loss": -0.023,
"num_tokens": 123493795.0,
"reward": 1.1914230823516845,
"reward_std": 0.19178390204906465,
"rewards/accuracy_reward": 0.642100703716278,
"rewards/brier_reward": 0.7638221859931946,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9769097208976746,
"rewards/mean_confidence_reward": 0.6678263902664184,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.020225694444444442,
"completions/max_length": 3798.4,
"completions/max_terminated_length": 3798.4,
"completions/mean_length": 747.4920166015625,
"completions/mean_terminated_length": 762.8623779296875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 239.8,
"epoch": 0.1559980500243747,
"grad_norm": 0.0006233630119822919,
"learning_rate": 3.629441624365482e-06,
"loss": -0.0202,
"num_tokens": 135198951.0,
"reward": 1.2134350299835206,
"reward_std": 0.17404497265815735,
"rewards/accuracy_reward": 0.6685763835906983,
"rewards/brier_reward": 0.7786794900894165,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9796006917953491,
"rewards/mean_confidence_reward": 0.6638238191604614,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.028559027777777767,
"completions/max_length": 3473.4,
"completions/max_terminated_length": 3473.4,
"completions/mean_length": 696.1581665039063,
"completions/mean_terminated_length": 716.5174926757812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 204.2,
"epoch": 0.16799790002624967,
"grad_norm": 0.000540408305823803,
"learning_rate": 3.5025380710659903e-06,
"loss": -0.0296,
"num_tokens": 146296837.0,
"reward": 1.195624089241028,
"reward_std": 0.1801719158887863,
"rewards/accuracy_reward": 0.6525173664093018,
"rewards/brier_reward": 0.7673637270927429,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9713541746139527,
"rewards/mean_confidence_reward": 0.640026044845581,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.018923611111111117,
"completions/max_length": 3295.0,
"completions/max_terminated_length": 3295.0,
"completions/mean_length": 678.0617309570313,
"completions/mean_terminated_length": 691.085107421875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 235.2,
"epoch": 0.17999775002812465,
"grad_norm": 0.00048515613889321685,
"learning_rate": 3.375634517766498e-06,
"loss": -0.0202,
"num_tokens": 157173004.0,
"reward": 1.236108160018921,
"reward_std": 0.1727673351764679,
"rewards/accuracy_reward": 0.7004340291023254,
"rewards/brier_reward": 0.7907795429229736,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9809895873069763,
"rewards/mean_confidence_reward": 0.6519270777702332,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01710069444444442,
"completions/max_length": 3602.2,
"completions/max_terminated_length": 3602.2,
"completions/mean_length": 690.432568359375,
"completions/mean_terminated_length": 702.5093383789062,
"completions/min_length": 0.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.19199760002999963,
"grad_norm": 0.0005238919984549284,
"learning_rate": 3.2487309644670053e-06,
"loss": -0.0204,
"num_tokens": 168180067.0,
"reward": 1.2182461738586425,
"reward_std": 0.17620865106582642,
"rewards/accuracy_reward": 0.6709201455116272,
"rewards/brier_reward": 0.7827465415000916,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9828124880790711,
"rewards/mean_confidence_reward": 0.6536284565925599,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01605902777777779,
"completions/max_length": 3918.6,
"completions/max_terminated_length": 3918.6,
"completions/mean_length": 671.5053955078125,
"completions/mean_terminated_length": 682.5617919921875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 224.4,
"epoch": 0.2039974500318746,
"grad_norm": 0.0005350089631974697,
"learning_rate": 3.121827411167513e-06,
"loss": -0.0179,
"num_tokens": 179003009.0,
"reward": 1.2336432933807373,
"reward_std": 0.1719420313835144,
"rewards/accuracy_reward": 0.6908854126930237,
"rewards/brier_reward": 0.7926206350326538,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9837673664093017,
"rewards/mean_confidence_reward": 0.6512760162353516,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.016840277777777767,
"completions/max_length": 3078.4,
"completions/max_terminated_length": 3078.4,
"completions/mean_length": 637.0895751953125,
"completions/mean_terminated_length": 648.0412475585938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.8,
"epoch": 0.2159973000337496,
"grad_norm": 0.0005947561003267765,
"learning_rate": 2.9949238578680207e-06,
"loss": -0.0182,
"num_tokens": 189410953.0,
"reward": 1.2236034631729127,
"reward_std": 0.17065767645835878,
"rewards/accuracy_reward": 0.6772569417953491,
"rewards/brier_reward": 0.7867769002914429,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9831597208976746,
"rewards/mean_confidence_reward": 0.6615538239479065,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.011024305555555558,
"completions/max_length": 3396.0,
"completions/max_terminated_length": 3396.0,
"completions/mean_length": 646.1296875,
"completions/mean_terminated_length": 653.329443359375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 181.2,
"epoch": 0.22799715003562457,
"grad_norm": 0.0004906430258415639,
"learning_rate": 2.8680203045685284e-06,
"loss": -0.0109,
"num_tokens": 199946047.0,
"reward": 1.2180467605590821,
"reward_std": 0.15326592326164246,
"rewards/accuracy_reward": 0.66171875,
"rewards/brier_reward": 0.7853862762451171,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9889756798744201,
"rewards/mean_confidence_reward": 0.6348177313804626,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 3291.2,
"completions/max_terminated_length": 3291.2,
"completions/mean_length": 667.8658081054688,
"completions/mean_terminated_length": 678.49951171875,
"completions/min_length": 0.0,
"completions/min_terminated_length": 193.2,
"epoch": 0.23999700003749952,
"grad_norm": 0.000494258594699204,
"learning_rate": 2.7411167512690357e-06,
"loss": -0.0162,
"num_tokens": 210738933.0,
"reward": 1.2323055982589721,
"reward_std": 0.16659289300441743,
"rewards/accuracy_reward": 0.6868055462837219,
"rewards/brier_reward": 0.7935048103332519,
"rewards/confidence_one_or_zero": 8.680555620230735e-05,
"rewards/format_reward": 0.9842881917953491,
"rewards/mean_confidence_reward": 0.6296585679054261,
"step": 100
},
{
"epoch": 0.23999700003749952,
"eval_completions/clipped_ratio": 0.01128472222222221,
"eval_completions/max_length": 2818.6666666666665,
"eval_completions/max_terminated_length": 2818.6666666666665,
"eval_completions/mean_length": 672.4380289713541,
"eval_completions/mean_terminated_length": 680.0551249186198,
"eval_completions/min_length": 57.666666666666664,
"eval_completions/min_terminated_length": 268.8333333333333,
"eval_loss": 0.0,
"eval_num_tokens": 210738933.0,
"eval_reward": 1.2182661890983582,
"eval_reward_std": 0.33711118002732593,
"eval_rewards/accuracy_reward": 0.6614583333333334,
"eval_rewards/brier_reward": 0.7889496286710104,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.9861111044883728,
"eval_rewards/mean_confidence_reward": 0.6596354246139526,
"eval_runtime": 314.2252,
"eval_samples_per_second": 3.182,
"eval_steps_per_second": 0.019,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01831597222222221,
"completions/max_length": 3792.2,
"completions/max_terminated_length": 3792.2,
"completions/mean_length": 671.8076416015625,
"completions/mean_terminated_length": 684.3883666992188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 204.8,
"epoch": 0.2519968500393745,
"grad_norm": 0.0006207000697031617,
"learning_rate": 2.6142131979695434e-06,
"loss": -0.0184,
"num_tokens": 221555021.0,
"reward": 1.2260079383850098,
"reward_std": 0.16747182309627534,
"rewards/accuracy_reward": 0.6787326455116272,
"rewards/brier_reward": 0.7916723132133484,
"rewards/confidence_one_or_zero": 0.0001736111124046147,
"rewards/format_reward": 0.9815972089767456,
"rewards/mean_confidence_reward": 0.6814884305000305,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.012586805555555535,
"completions/max_length": 3638.2,
"completions/max_terminated_length": 3638.2,
"completions/mean_length": 690.3018188476562,
"completions/mean_terminated_length": 699.1578125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.2639967000412495,
"grad_norm": 0.0006169977132230997,
"learning_rate": 2.487309644670051e-06,
"loss": -0.0118,
"num_tokens": 232615746.0,
"reward": 1.2521895170211792,
"reward_std": 0.16357295215129852,
"rewards/accuracy_reward": 0.7092881917953491,
"rewards/brier_reward": 0.8077497601509094,
"rewards/confidence_one_or_zero": 0.0001736111124046147,
"rewards/format_reward": 0.9873263955116272,
"rewards/mean_confidence_reward": 0.7294313907623291,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015538194444444441,
"completions/max_length": 3454.4,
"completions/max_terminated_length": 3454.4,
"completions/mean_length": 684.7499877929688,
"completions/mean_terminated_length": 695.5743530273437,
"completions/min_length": 0.0,
"completions/min_terminated_length": 209.4,
"epoch": 0.27599655004312446,
"grad_norm": 0.0005687119555659592,
"learning_rate": 2.3604060913705588e-06,
"loss": -0.0154,
"num_tokens": 243583266.0,
"reward": 1.223623776435852,
"reward_std": 0.16944999396800994,
"rewards/accuracy_reward": 0.6738715291023254,
"rewards/brier_reward": 0.7888993144035339,
"rewards/confidence_one_or_zero": 0.0004340277810115367,
"rewards/format_reward": 0.9844618082046509,
"rewards/mean_confidence_reward": 0.7369444251060486,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009201388888888884,
"completions/max_length": 3064.6,
"completions/max_terminated_length": 3064.6,
"completions/mean_length": 670.030126953125,
"completions/mean_terminated_length": 676.1909423828125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 231.4,
"epoch": 0.28799640004499943,
"grad_norm": 0.0004680782149080187,
"learning_rate": 2.233502538071066e-06,
"loss": -0.009,
"num_tokens": 254383869.0,
"reward": 1.2492746114730835,
"reward_std": 0.15400342345237733,
"rewards/accuracy_reward": 0.7008680582046509,
"rewards/brier_reward": 0.8069549679756165,
"rewards/confidence_one_or_zero": 8.680555620230735e-05,
"rewards/format_reward": 0.9907118082046509,
"rewards/mean_confidence_reward": 0.7179887175559998,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0064236111111110935,
"completions/max_length": 2877.4,
"completions/max_terminated_length": 2877.4,
"completions/mean_length": 677.3794311523437,
"completions/mean_terminated_length": 681.78896484375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 220.8,
"epoch": 0.2999962500468744,
"grad_norm": 0.0006309309974312782,
"learning_rate": 2.1065989847715737e-06,
"loss": -0.0074,
"num_tokens": 265304944.0,
"reward": 1.2552873849868775,
"reward_std": 0.1533792197704315,
"rewards/accuracy_reward": 0.7022569417953491,
"rewards/brier_reward": 0.8147274374961853,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9935763835906982,
"rewards/mean_confidence_reward": 0.6978819608688355,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.013888888888888885,
"completions/max_length": 3585.8,
"completions/max_terminated_length": 3585.8,
"completions/mean_length": 700.1473266601563,
"completions/mean_terminated_length": 710.0007690429687,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.2,
"epoch": 0.3119961000487494,
"grad_norm": 0.0005744310328736901,
"learning_rate": 1.9796954314720814e-06,
"loss": -0.0155,
"num_tokens": 276495441.0,
"reward": 1.2269079685211182,
"reward_std": 0.16605815887451172,
"rewards/accuracy_reward": 0.6709201335906982,
"rewards/brier_reward": 0.7968581199645997,
"rewards/confidence_one_or_zero": 0.00026041667442768814,
"rewards/format_reward": 0.9860243201255798,
"rewards/mean_confidence_reward": 0.666854739189148,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007031249999999978,
"completions/max_length": 3814.4,
"completions/max_terminated_length": 3814.4,
"completions/mean_length": 702.3100708007812,
"completions/mean_terminated_length": 707.2436401367188,
"completions/min_length": 0.0,
"completions/min_terminated_length": 231.2,
"epoch": 0.32399595005062437,
"grad_norm": 0.0005819419166073203,
"learning_rate": 1.852791878172589e-06,
"loss": -0.0068,
"num_tokens": 287679077.0,
"reward": 1.251250958442688,
"reward_std": 0.14554388970136642,
"rewards/accuracy_reward": 0.6952257037162781,
"rewards/brier_reward": 0.8143810749053955,
"rewards/confidence_one_or_zero": 8.680555620230735e-05,
"rewards/format_reward": 0.9928819537162781,
"rewards/mean_confidence_reward": 0.6603732705116272,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00902777777777779,
"completions/max_length": 3521.4,
"completions/max_terminated_length": 3521.4,
"completions/mean_length": 709.2608642578125,
"completions/mean_terminated_length": 715.695703125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 237.6,
"epoch": 0.33599580005249935,
"grad_norm": 0.0005530334892682731,
"learning_rate": 1.7258883248730964e-06,
"loss": -0.0088,
"num_tokens": 298953986.0,
"reward": 1.2382160425186157,
"reward_std": 0.14605504274368286,
"rewards/accuracy_reward": 0.6803819537162781,
"rewards/brier_reward": 0.8050644397735596,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9909722208976746,
"rewards/mean_confidence_reward": 0.6734331607818603,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00746527777777779,
"completions/max_length": 3415.6,
"completions/max_terminated_length": 3415.6,
"completions/mean_length": 710.8110473632812,
"completions/mean_terminated_length": 716.1941772460938,
"completions/min_length": 0.0,
"completions/min_terminated_length": 255.4,
"epoch": 0.34799565005437433,
"grad_norm": 0.0005190471420064569,
"learning_rate": 1.5989847715736043e-06,
"loss": -0.0078,
"num_tokens": 310207137.0,
"reward": 1.271397852897644,
"reward_std": 0.13708409070968627,
"rewards/accuracy_reward": 0.7222222208976745,
"rewards/brier_reward": 0.828024935722351,
"rewards/confidence_one_or_zero": 8.680555620230735e-05,
"rewards/format_reward": 0.9925347208976746,
"rewards/mean_confidence_reward": 0.6911779403686523,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003906249999999978,
"completions/max_length": 3718.4,
"completions/max_terminated_length": 3718.4,
"completions/mean_length": 770.7046997070313,
"completions/mean_terminated_length": 773.7459228515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 239.4,
"epoch": 0.3599955000562493,
"grad_norm": 0.0005380141083151102,
"learning_rate": 1.4720812182741118e-06,
"loss": -0.0025,
"num_tokens": 322195991.0,
"reward": 1.2562948942184449,
"reward_std": 0.1399953231215477,
"rewards/accuracy_reward": 0.7012152671813965,
"rewards/brier_reward": 0.8152669310569763,
"rewards/confidence_one_or_zero": 8.680555620230735e-05,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.6894097328186035,
"step": 150
},
{
"epoch": 0.3599955000562493,
"eval_completions/clipped_ratio": 0.008680555555555561,
"eval_completions/max_length": 2509.0,
"eval_completions/max_terminated_length": 2509.0,
"eval_completions/mean_length": 739.3196309407552,
"eval_completions/mean_terminated_length": 745.8552551269531,
"eval_completions/min_length": 66.16666666666667,
"eval_completions/min_terminated_length": 309.3333333333333,
"eval_loss": 0.0,
"eval_num_tokens": 322195991.0,
"eval_reward": 1.2427564859390259,
"eval_reward_std": 0.3247065891822179,
"eval_rewards/accuracy_reward": 0.6866319477558136,
"eval_rewards/brier_reward": 0.8075478275616964,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.9913194477558136,
"eval_rewards/mean_confidence_reward": 0.6826649407545725,
"eval_runtime": 284.0655,
"eval_samples_per_second": 3.52,
"eval_steps_per_second": 0.021,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.004166666666666652,
"completions/max_length": 2905.4,
"completions/max_terminated_length": 2905.4,
"completions/mean_length": 732.27587890625,
"completions/mean_terminated_length": 735.4041870117187,
"completions/min_length": 0.0,
"completions/min_terminated_length": 278.2,
"epoch": 0.3719953500581243,
"grad_norm": 0.0006105477805249393,
"learning_rate": 1.3451776649746193e-06,
"loss": -0.004,
"num_tokens": 333739521.0,
"reward": 1.2941375255584717,
"reward_std": 0.13965744376182557,
"rewards/accuracy_reward": 0.7543402791023255,
"rewards/brier_reward": 0.8380874633789063,
"rewards/confidence_one_or_zero": 0.0001736111124046147,
"rewards/format_reward": 0.9958333253860474,
"rewards/mean_confidence_reward": 0.6942404508590698,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.008246527777777768,
"completions/max_length": 2973.6,
"completions/max_terminated_length": 2973.6,
"completions/mean_length": 753.7942626953125,
"completions/mean_terminated_length": 760.0675659179688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 269.8,
"epoch": 0.38399520005999926,
"grad_norm": 0.0005153357633389533,
"learning_rate": 1.218274111675127e-06,
"loss": -0.0084,
"num_tokens": 345510527.0,
"reward": 1.2482587575912476,
"reward_std": 0.14498619139194488,
"rewards/accuracy_reward": 0.6890625,
"rewards/brier_reward": 0.8156877160072327,
"rewards/confidence_one_or_zero": 8.680555620230735e-05,
"rewards/format_reward": 0.9917534708976745,
"rewards/mean_confidence_reward": 0.6878429055213928,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007725694444444442,
"completions/max_length": 3382.4,
"completions/max_terminated_length": 3382.4,
"completions/mean_length": 792.88369140625,
"completions/mean_terminated_length": 799.1453125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.39599505006187424,
"grad_norm": 0.0005191068630665541,
"learning_rate": 1.0913705583756345e-06,
"loss": -0.0066,
"num_tokens": 357783619.0,
"reward": 1.2369651794433594,
"reward_std": 0.14206757992506028,
"rewards/accuracy_reward": 0.6712673664093017,
"rewards/brier_reward": 0.8103755116462708,
"rewards/confidence_one_or_zero": 0.0013020833430346102,
"rewards/format_reward": 0.9922743082046509,
"rewards/mean_confidence_reward": 0.6545373201370239,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005381944444444464,
"completions/max_length": 3044.8,
"completions/max_terminated_length": 3044.8,
"completions/mean_length": 775.5628540039063,
"completions/mean_terminated_length": 779.7671752929688,
"completions/min_length": 0.0,
"completions/min_terminated_length": 303.0,
"epoch": 0.4079949000637492,
"grad_norm": 0.0005426673451438546,
"learning_rate": 9.644670050761422e-07,
"loss": -0.0058,
"num_tokens": 369807287.0,
"reward": 1.2703901767730712,
"reward_std": 0.1331357017159462,
"rewards/accuracy_reward": 0.7223090291023254,
"rewards/brier_reward": 0.8238399386405945,
"rewards/confidence_one_or_zero": 0.0006076389108784497,
"rewards/format_reward": 0.9946180701255798,
"rewards/mean_confidence_reward": 0.664151918888092,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.007986111111111093,
"completions/max_length": 3587.6,
"completions/max_terminated_length": 3587.6,
"completions/mean_length": 812.0387084960937,
"completions/mean_terminated_length": 818.592236328125,
"completions/min_length": 0.0,
"completions/min_terminated_length": 253.6,
"epoch": 0.4199947500656242,
"grad_norm": 0.000529599201399833,
"learning_rate": 8.375634517766498e-07,
"loss": -0.0086,
"num_tokens": 382269941.0,
"reward": 1.2632122039794922,
"reward_std": 0.1427206039428711,
"rewards/accuracy_reward": 0.7166666626930237,
"rewards/brier_reward": 0.8178175330162049,
"rewards/confidence_one_or_zero": 0.0006944444554392249,
"rewards/format_reward": 0.9919270992279052,
"rewards/mean_confidence_reward": 0.6496527791023254,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00850694444444442,
"completions/max_length": 3676.6,
"completions/max_terminated_length": 3676.6,
"completions/mean_length": 786.1634643554687,
"completions/mean_terminated_length": 792.9117431640625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 300.8,
"epoch": 0.4319946000674992,
"grad_norm": 0.0004947619745507836,
"learning_rate": 7.106598984771574e-07,
"loss": -0.0086,
"num_tokens": 394426512.0,
"reward": 1.269280982017517,
"reward_std": 0.14375588595867156,
"rewards/accuracy_reward": 0.7263888955116272,
"rewards/brier_reward": 0.8207532525062561,
"rewards/confidence_one_or_zero": 0.0011284722306299955,
"rewards/format_reward": 0.991406238079071,
"rewards/mean_confidence_reward": 0.6751345515251159,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.011545138888888884,
"completions/max_length": 3260.6,
"completions/max_terminated_length": 3260.6,
"completions/mean_length": 785.8776123046875,
"completions/mean_terminated_length": 795.1585571289063,
"completions/min_length": 0.0,
"completions/min_terminated_length": 274.0,
"epoch": 0.44399445006937416,
"grad_norm": 0.0005510540795512497,
"learning_rate": 5.83756345177665e-07,
"loss": -0.0098,
"num_tokens": 406569870.0,
"reward": 1.2565922260284423,
"reward_std": 0.1406260371208191,
"rewards/accuracy_reward": 0.7071180582046509,
"rewards/brier_reward": 0.8175978541374207,
"rewards/confidence_one_or_zero": 0.00034722223062999547,
"rewards/format_reward": 0.9884548544883728,
"rewards/mean_confidence_reward": 0.68436199426651,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 3258.4,
"completions/max_terminated_length": 3258.4,
"completions/mean_length": 780.2356811523438,
"completions/mean_terminated_length": 786.3969848632812,
"completions/min_length": 0.0,
"completions/min_terminated_length": 293.6,
"epoch": 0.45599430007124914,
"grad_norm": 0.0005112453945912421,
"learning_rate": 4.568527918781726e-07,
"loss": -0.007,
"num_tokens": 418641129.0,
"reward": 1.2805275201797486,
"reward_std": 0.14128611385822296,
"rewards/accuracy_reward": 0.74140625,
"rewards/brier_reward": 0.8274471998214722,
"rewards/confidence_one_or_zero": 8.680555620230735e-05,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.7002317667007446,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01128472222222221,
"completions/max_length": 3505.8,
"completions/max_terminated_length": 3505.8,
"completions/mean_length": 798.0532104492188,
"completions/mean_terminated_length": 807.356103515625,
"completions/min_length": 0.0,
"completions/min_terminated_length": 268.8,
"epoch": 0.46799415007312406,
"grad_norm": 0.0005772958975285292,
"learning_rate": 3.2994923857868026e-07,
"loss": -0.011,
"num_tokens": 430915566.0,
"reward": 1.2434212923049928,
"reward_std": 0.15206148028373717,
"rewards/accuracy_reward": 0.6927083253860473,
"rewards/brier_reward": 0.8054919719696045,
"rewards/confidence_one_or_zero": 0.0004340277868323028,
"rewards/format_reward": 0.9886284708976746,
"rewards/mean_confidence_reward": 0.6865755438804626,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.006857638888888884,
"completions/max_length": 3417.0,
"completions/max_terminated_length": 3417.0,
"completions/mean_length": 776.532470703125,
"completions/mean_terminated_length": 781.9117309570313,
"completions/min_length": 0.0,
"completions/min_terminated_length": 274.6,
"epoch": 0.47999400007499904,
"grad_norm": 0.0006025061593391001,
"learning_rate": 2.0304568527918783e-07,
"loss": -0.007,
"num_tokens": 442929028.0,
"reward": 1.2590484380722047,
"reward_std": 0.14006066024303437,
"rewards/accuracy_reward": 0.7050347089767456,
"rewards/brier_reward": 0.8199058055877686,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9931423544883728,
"rewards/mean_confidence_reward": 0.6951475501060486,
"step": 200
},
{
"epoch": 0.47999400007499904,
"eval_completions/clipped_ratio": 0.006770833333333337,
"eval_completions/max_length": 2710.8333333333335,
"eval_completions/max_terminated_length": 2710.8333333333335,
"eval_completions/mean_length": 788.9212036132812,
"eval_completions/mean_terminated_length": 794.1668294270834,
"eval_completions/min_length": 158.66666666666666,
"eval_completions/min_terminated_length": 343.0,
"eval_loss": 0.0,
"eval_num_tokens": 442929028.0,
"eval_reward": 1.254876693089803,
"eval_reward_std": 0.30982651313145954,
"eval_rewards/accuracy_reward": 0.6918402711550394,
"eval_rewards/brier_reward": 0.8231076200803121,
"eval_rewards/confidence_one_or_zero": 0.0008680555814256271,
"eval_rewards/format_reward": 0.9947916666666666,
"eval_rewards/mean_confidence_reward": 0.6819444298744202,
"eval_runtime": 275.8753,
"eval_samples_per_second": 3.625,
"eval_steps_per_second": 0.022,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0052951388888889065,
"completions/max_length": 3538.0,
"completions/max_terminated_length": 3538.0,
"completions/mean_length": 799.853662109375,
"completions/mean_terminated_length": 804.1119750976562,
"completions/min_length": 0.0,
"completions/min_terminated_length": 265.2,
"epoch": 0.491993850076874,
"grad_norm": 0.0005411679157987237,
"learning_rate": 7.614213197969544e-08,
"loss": -0.0035,
"num_tokens": 455209294.0,
"reward": 1.2997782230377197,
"reward_std": 0.1340979665517807,
"rewards/accuracy_reward": 0.7611111044883728,
"rewards/brier_reward": 0.8437265396118164,
"rewards/confidence_one_or_zero": 0.0005208333546761424,
"rewards/format_reward": 0.9947048664093018,
"rewards/mean_confidence_reward": 0.6931857466697693,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003906249999999963,
"completions/max_length": 3058.0,
"completions/max_terminated_length": 3058.0,
"completions/mean_length": 792.8047078450521,
"completions/mean_terminated_length": 795.9739786783854,
"completions/min_length": 0.0,
"completions/min_terminated_length": 277.6666666666667,
"epoch": 0.49919376007799904,
"num_tokens": 462555208.0,
"reward": 1.2620628277460735,
"reward_std": 0.13802013049523035,
"rewards/accuracy_reward": 0.7123842636744181,
"rewards/brier_reward": 0.8156337340672811,
"rewards/confidence_one_or_zero": 0.0015914352067435782,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.6858723759651184,
"step": 208,
"total_flos": 0.0,
"train_loss": -0.01281338286263725,
"train_runtime": 80412.2515,
"train_samples_per_second": 0.187,
"train_steps_per_second": 0.003
}
],
"logging_steps": 5,
"max_steps": 208,
"num_input_tokens_seen": 462555208,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}