{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.49919376007799904, "eval_steps": 50, "global_step": 208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021875, "completions/max_length": 4010.4, "completions/max_terminated_length": 4010.4, "completions/mean_length": 503.759375, "completions/mean_terminated_length": 515.0101257324219, "completions/min_length": 0.0, "completions/min_terminated_length": 6.8, "epoch": 0.011999850001874977, "grad_norm": 0.016190193593502045, "learning_rate": 2.2727272727272728e-06, "loss": 0.006, "num_tokens": 8917516.0, "reward": 0.6058741450309754, "reward_std": 0.534743320941925, "rewards/accuracy_reward": 0.26883680522441866, "rewards/brier_reward": 0.3217940092086792, "rewards/confidence_one_or_zero": 0.34730902314186096, "rewards/format_reward": 0.6210937380790711, "rewards/mean_confidence_reward": 0.8363143563270569, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009114583333333325, "completions/max_length": 3499.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 406.26458740234375, "completions/mean_terminated_length": 410.0914794921875, "completions/min_length": 0.0, "completions/min_terminated_length": 25.2, "epoch": 0.023999700003749954, "grad_norm": 0.005708878394216299, "learning_rate": 4.5454545454545455e-06, "loss": -0.0094, "num_tokens": 16680404.0, "reward": 0.8901966333389282, "reward_std": 0.4035582482814789, "rewards/accuracy_reward": 0.37309027314186094, "rewards/brier_reward": 0.4747287094593048, "rewards/confidence_one_or_zero": 0.2348090261220932, "rewards/format_reward": 0.9325520753860473, "rewards/mean_confidence_reward": 0.8743563175201416, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01293402777777779, "completions/max_length": 3835.0, "completions/max_terminated_length": 3835.0, "completions/mean_length": 452.3516540527344, "completions/mean_terminated_length": 458.4701354980469, "completions/min_length": 0.0, "completions/min_terminated_length": 91.4, "epoch": 0.03599955000562493, "grad_norm": 0.0010099423816427588, "learning_rate": 4.898477157360406e-06, "loss": -0.0129, "num_tokens": 24993479.0, "reward": 1.066845965385437, "reward_std": 0.29507095515728, "rewards/accuracy_reward": 0.49340277910232544, "rewards/brier_reward": 0.6558980584144593, "rewards/confidence_one_or_zero": 0.02864583337213844, "rewards/format_reward": 0.9843750119209289, "rewards/mean_confidence_reward": 0.7727026224136353, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.018142361111111116, "completions/max_length": 3412.8, "completions/max_terminated_length": 3412.8, "completions/mean_length": 560.2568603515625, "completions/mean_terminated_length": 570.698095703125, "completions/min_length": 0.0, "completions/min_terminated_length": 136.6, "epoch": 0.04799940000749991, "grad_norm": 0.0005838912329636514, "learning_rate": 4.771573604060914e-06, "loss": -0.018, "num_tokens": 34561334.0, "reward": 1.166806983947754, "reward_std": 0.2173929274082184, "rewards/accuracy_reward": 0.6097222089767456, "rewards/brier_reward": 0.7459280967712403, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9779513955116272, "rewards/mean_confidence_reward": 0.6061267256736755, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016753472222222232, "completions/max_length": 3364.8, "completions/max_terminated_length": 3364.8, "completions/mean_length": 599.8819458007813, "completions/mean_terminated_length": 610.2050415039063, "completions/min_length": 0.0, "completions/min_terminated_length": 154.0, "epoch": 0.05999925000937488, "grad_norm": 0.0008427119464613497, "learning_rate": 4.644670050761422e-06, "loss": -0.0155, "num_tokens": 44596422.0, "reward": 1.193192458152771, "reward_std": 0.18251443803310394, "rewards/accuracy_reward": 0.6541666746139526, "rewards/brier_reward": 0.7504361510276795, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9817708253860473, "rewards/mean_confidence_reward": 0.5526909828186035, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022048611111111095, "completions/max_length": 3541.2, "completions/max_terminated_length": 3541.2, "completions/mean_length": 644.44619140625, "completions/mean_terminated_length": 659.1096435546875, "completions/min_length": 0.0, "completions/min_terminated_length": 189.6, "epoch": 0.07199910001124986, "grad_norm": 0.0007436223677359521, "learning_rate": 4.5177664974619295e-06, "loss": -0.0201, "num_tokens": 55130362.0, "reward": 1.1905469655990601, "reward_std": 0.19011308550834655, "rewards/accuracy_reward": 0.6495659589767456, "rewards/brier_reward": 0.7538252711296082, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9776909708976745, "rewards/mean_confidence_reward": 0.5784592151641845, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025260416666666653, "completions/max_length": 3744.8, "completions/max_terminated_length": 3744.8, "completions/mean_length": 655.0560791015625, "completions/mean_terminated_length": 672.1588012695313, "completions/min_length": 0.0, "completions/min_terminated_length": 219.0, "epoch": 0.08399895001312484, "grad_norm": 0.0005305504309944808, "learning_rate": 4.390862944162436e-06, "loss": -0.0234, "num_tokens": 65754048.0, "reward": 1.1947672843933106, "reward_std": 0.18359957337379457, "rewards/accuracy_reward": 0.6561631917953491, "rewards/brier_reward": 0.7592274188995362, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.974131953716278, "rewards/mean_confidence_reward": 0.5979774236679077, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02230902777777779, "completions/max_length": 3770.0, "completions/max_terminated_length": 3770.0, "completions/mean_length": 660.4337768554688, "completions/mean_terminated_length": 675.5411987304688, "completions/min_length": 0.0, "completions/min_terminated_length": 223.4, "epoch": 0.09599880001499982, "grad_norm": 0.0005542829167097807, "learning_rate": 4.263959390862945e-06, "loss": -0.0235, "num_tokens": 76481765.0, "reward": 1.2054712533950807, "reward_std": 0.1817381888628006, "rewards/accuracy_reward": 0.6667534708976746, "rewards/brier_reward": 0.7666590690612793, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9775173544883728, "rewards/mean_confidence_reward": 0.6286675214767456, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01597222222222221, "completions/max_length": 3583.8, "completions/max_terminated_length": 3583.8, "completions/mean_length": 703.358935546875, "completions/mean_terminated_length": 714.7557250976563, "completions/min_length": 0.0, "completions/min_terminated_length": 240.8, "epoch": 0.1079986500168748, "grad_norm": 0.0005376818589866161, "learning_rate": 4.137055837563453e-06, "loss": -0.0162, "num_tokens": 87719724.0, "reward": 1.2115298509597778, "reward_std": 0.18161508142948152, "rewards/accuracy_reward": 0.6661458253860474, "rewards/brier_reward": 0.7729592084884643, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9839409708976745, "rewards/mean_confidence_reward": 0.6761284708976746, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.018055555555555537, "completions/max_length": 3400.2, "completions/max_terminated_length": 3400.2, "completions/mean_length": 739.211376953125, "completions/mean_terminated_length": 752.7696411132813, "completions/min_length": 0.0, "completions/min_terminated_length": 249.8, "epoch": 0.11999850001874976, "grad_norm": 0.0006366602028720081, "learning_rate": 4.0101522842639595e-06, "loss": -0.019, "num_tokens": 99333039.0, "reward": 1.2029428243637086, "reward_std": 0.18159882426261903, "rewards/accuracy_reward": 0.6561631917953491, "rewards/brier_reward": 0.7679378151893616, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9817708373069763, "rewards/mean_confidence_reward": 0.6839114427566528, "step": 50 }, { "epoch": 0.11999850001874976, "eval_completions/clipped_ratio": 0.021701388888888895, "eval_completions/max_length": 2201.0, "eval_completions/max_terminated_length": 2201.0, "eval_completions/mean_length": 737.701416015625, "eval_completions/mean_terminated_length": 753.9890950520834, "eval_completions/min_length": 65.66666666666667, "eval_completions/min_terminated_length": 332.6666666666667, "eval_loss": 0.0, "eval_num_tokens": 99333039.0, "eval_reward": 1.199620544910431, "eval_reward_std": 0.36773162086804706, "eval_rewards/accuracy_reward": 0.6571180721124014, "eval_rewards/brier_reward": 0.7664149304231008, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9756944477558136, "eval_rewards/mean_confidence_reward": 0.6796006560325623, "eval_runtime": 321.4866, "eval_samples_per_second": 3.111, "eval_steps_per_second": 0.019, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02265625, "completions/max_length": 3476.6, "completions/max_terminated_length": 3476.6, "completions/mean_length": 771.7972290039063, "completions/mean_terminated_length": 789.8291259765625, "completions/min_length": 0.0, "completions/min_terminated_length": 241.8, "epoch": 0.13199835002062474, "grad_norm": 0.0005807679262943566, "learning_rate": 3.883248730964467e-06, "loss": -0.0199, "num_tokens": 111304719.0, "reward": 1.207960844039917, "reward_std": 0.18596434891223906, "rewards/accuracy_reward": 0.6674479126930237, "rewards/brier_reward": 0.7711163401603699, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.97734375, "rewards/mean_confidence_reward": 0.6762413263320923, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02265625, "completions/max_length": 3830.4, "completions/max_terminated_length": 3830.4, "completions/mean_length": 789.2795166015625, "completions/mean_terminated_length": 807.6457641601562, "completions/min_length": 0.0, "completions/min_terminated_length": 274.8, "epoch": 0.14399820002249972, "grad_norm": 0.0004933876334689558, "learning_rate": 3.756345177664975e-06, "loss": -0.023, "num_tokens": 123493795.0, "reward": 1.1914230823516845, "reward_std": 0.19178390204906465, "rewards/accuracy_reward": 0.642100703716278, "rewards/brier_reward": 0.7638221859931946, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9769097208976746, "rewards/mean_confidence_reward": 0.6678263902664184, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.020225694444444442, "completions/max_length": 3798.4, "completions/max_terminated_length": 3798.4, "completions/mean_length": 747.4920166015625, "completions/mean_terminated_length": 762.8623779296875, "completions/min_length": 0.0, "completions/min_terminated_length": 239.8, "epoch": 0.1559980500243747, "grad_norm": 0.0006233630119822919, "learning_rate": 3.629441624365482e-06, "loss": -0.0202, "num_tokens": 135198951.0, "reward": 1.2134350299835206, "reward_std": 0.17404497265815735, "rewards/accuracy_reward": 0.6685763835906983, "rewards/brier_reward": 0.7786794900894165, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9796006917953491, "rewards/mean_confidence_reward": 0.6638238191604614, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.028559027777777767, "completions/max_length": 3473.4, "completions/max_terminated_length": 3473.4, "completions/mean_length": 696.1581665039063, "completions/mean_terminated_length": 716.5174926757812, "completions/min_length": 0.0, "completions/min_terminated_length": 204.2, "epoch": 0.16799790002624967, "grad_norm": 0.000540408305823803, "learning_rate": 3.5025380710659903e-06, "loss": -0.0296, "num_tokens": 146296837.0, "reward": 1.195624089241028, "reward_std": 0.1801719158887863, "rewards/accuracy_reward": 0.6525173664093018, "rewards/brier_reward": 0.7673637270927429, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9713541746139527, "rewards/mean_confidence_reward": 0.640026044845581, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.018923611111111117, "completions/max_length": 3295.0, "completions/max_terminated_length": 3295.0, "completions/mean_length": 678.0617309570313, "completions/mean_terminated_length": 691.085107421875, "completions/min_length": 0.0, "completions/min_terminated_length": 235.2, "epoch": 0.17999775002812465, "grad_norm": 0.00048515613889321685, "learning_rate": 3.375634517766498e-06, "loss": -0.0202, "num_tokens": 157173004.0, "reward": 1.236108160018921, "reward_std": 0.1727673351764679, "rewards/accuracy_reward": 0.7004340291023254, "rewards/brier_reward": 0.7907795429229736, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9809895873069763, "rewards/mean_confidence_reward": 0.6519270777702332, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01710069444444442, "completions/max_length": 3602.2, "completions/max_terminated_length": 3602.2, "completions/mean_length": 690.432568359375, "completions/mean_terminated_length": 702.5093383789062, "completions/min_length": 0.0, "completions/min_terminated_length": 192.0, "epoch": 0.19199760002999963, "grad_norm": 0.0005238919984549284, "learning_rate": 3.2487309644670053e-06, "loss": -0.0204, "num_tokens": 168180067.0, "reward": 1.2182461738586425, "reward_std": 0.17620865106582642, "rewards/accuracy_reward": 0.6709201455116272, "rewards/brier_reward": 0.7827465415000916, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9828124880790711, "rewards/mean_confidence_reward": 0.6536284565925599, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01605902777777779, "completions/max_length": 3918.6, "completions/max_terminated_length": 3918.6, "completions/mean_length": 671.5053955078125, "completions/mean_terminated_length": 682.5617919921875, "completions/min_length": 0.0, "completions/min_terminated_length": 224.4, "epoch": 0.2039974500318746, "grad_norm": 0.0005350089631974697, "learning_rate": 3.121827411167513e-06, "loss": -0.0179, "num_tokens": 179003009.0, "reward": 1.2336432933807373, "reward_std": 0.1719420313835144, "rewards/accuracy_reward": 0.6908854126930237, "rewards/brier_reward": 0.7926206350326538, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837673664093017, "rewards/mean_confidence_reward": 0.6512760162353516, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016840277777777767, "completions/max_length": 3078.4, "completions/max_terminated_length": 3078.4, "completions/mean_length": 637.0895751953125, "completions/mean_terminated_length": 648.0412475585938, "completions/min_length": 0.0, "completions/min_terminated_length": 211.8, "epoch": 0.2159973000337496, "grad_norm": 0.0005947561003267765, "learning_rate": 2.9949238578680207e-06, "loss": -0.0182, "num_tokens": 189410953.0, "reward": 1.2236034631729127, "reward_std": 0.17065767645835878, "rewards/accuracy_reward": 0.6772569417953491, "rewards/brier_reward": 0.7867769002914429, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9831597208976746, "rewards/mean_confidence_reward": 0.6615538239479065, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011024305555555558, "completions/max_length": 3396.0, "completions/max_terminated_length": 3396.0, "completions/mean_length": 646.1296875, "completions/mean_terminated_length": 653.329443359375, "completions/min_length": 0.0, "completions/min_terminated_length": 181.2, "epoch": 0.22799715003562457, "grad_norm": 0.0004906430258415639, "learning_rate": 2.8680203045685284e-06, "loss": -0.0109, "num_tokens": 199946047.0, "reward": 1.2180467605590821, "reward_std": 0.15326592326164246, "rewards/accuracy_reward": 0.66171875, "rewards/brier_reward": 0.7853862762451171, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889756798744201, "rewards/mean_confidence_reward": 0.6348177313804626, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3291.2, "completions/max_terminated_length": 3291.2, "completions/mean_length": 667.8658081054688, "completions/mean_terminated_length": 678.49951171875, "completions/min_length": 0.0, "completions/min_terminated_length": 193.2, "epoch": 0.23999700003749952, "grad_norm": 0.000494258594699204, "learning_rate": 2.7411167512690357e-06, "loss": -0.0162, "num_tokens": 210738933.0, "reward": 1.2323055982589721, "reward_std": 0.16659289300441743, "rewards/accuracy_reward": 0.6868055462837219, "rewards/brier_reward": 0.7935048103332519, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9842881917953491, "rewards/mean_confidence_reward": 0.6296585679054261, "step": 100 }, { "epoch": 0.23999700003749952, "eval_completions/clipped_ratio": 0.01128472222222221, "eval_completions/max_length": 2818.6666666666665, "eval_completions/max_terminated_length": 2818.6666666666665, "eval_completions/mean_length": 672.4380289713541, "eval_completions/mean_terminated_length": 680.0551249186198, "eval_completions/min_length": 57.666666666666664, "eval_completions/min_terminated_length": 268.8333333333333, "eval_loss": 0.0, "eval_num_tokens": 210738933.0, "eval_reward": 1.2182661890983582, "eval_reward_std": 0.33711118002732593, "eval_rewards/accuracy_reward": 0.6614583333333334, "eval_rewards/brier_reward": 0.7889496286710104, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9861111044883728, "eval_rewards/mean_confidence_reward": 0.6596354246139526, "eval_runtime": 314.2252, "eval_samples_per_second": 3.182, "eval_steps_per_second": 0.019, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01831597222222221, "completions/max_length": 3792.2, "completions/max_terminated_length": 3792.2, "completions/mean_length": 671.8076416015625, "completions/mean_terminated_length": 684.3883666992188, "completions/min_length": 0.0, "completions/min_terminated_length": 204.8, "epoch": 0.2519968500393745, "grad_norm": 0.0006207000697031617, "learning_rate": 2.6142131979695434e-06, "loss": -0.0184, "num_tokens": 221555021.0, "reward": 1.2260079383850098, "reward_std": 0.16747182309627534, "rewards/accuracy_reward": 0.6787326455116272, "rewards/brier_reward": 0.7916723132133484, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9815972089767456, "rewards/mean_confidence_reward": 0.6814884305000305, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012586805555555535, "completions/max_length": 3638.2, "completions/max_terminated_length": 3638.2, "completions/mean_length": 690.3018188476562, "completions/mean_terminated_length": 699.1578125, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.2639967000412495, "grad_norm": 0.0006169977132230997, "learning_rate": 2.487309644670051e-06, "loss": -0.0118, "num_tokens": 232615746.0, "reward": 1.2521895170211792, "reward_std": 0.16357295215129852, "rewards/accuracy_reward": 0.7092881917953491, "rewards/brier_reward": 0.8077497601509094, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9873263955116272, "rewards/mean_confidence_reward": 0.7294313907623291, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015538194444444441, "completions/max_length": 3454.4, "completions/max_terminated_length": 3454.4, "completions/mean_length": 684.7499877929688, "completions/mean_terminated_length": 695.5743530273437, "completions/min_length": 0.0, "completions/min_terminated_length": 209.4, "epoch": 0.27599655004312446, "grad_norm": 0.0005687119555659592, "learning_rate": 2.3604060913705588e-06, "loss": -0.0154, "num_tokens": 243583266.0, "reward": 1.223623776435852, "reward_std": 0.16944999396800994, "rewards/accuracy_reward": 0.6738715291023254, "rewards/brier_reward": 0.7888993144035339, "rewards/confidence_one_or_zero": 0.0004340277810115367, "rewards/format_reward": 0.9844618082046509, "rewards/mean_confidence_reward": 0.7369444251060486, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009201388888888884, "completions/max_length": 3064.6, "completions/max_terminated_length": 3064.6, "completions/mean_length": 670.030126953125, "completions/mean_terminated_length": 676.1909423828125, "completions/min_length": 0.0, "completions/min_terminated_length": 231.4, "epoch": 0.28799640004499943, "grad_norm": 0.0004680782149080187, "learning_rate": 2.233502538071066e-06, "loss": -0.009, "num_tokens": 254383869.0, "reward": 1.2492746114730835, "reward_std": 0.15400342345237733, "rewards/accuracy_reward": 0.7008680582046509, "rewards/brier_reward": 0.8069549679756165, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9907118082046509, "rewards/mean_confidence_reward": 0.7179887175559998, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0064236111111110935, "completions/max_length": 2877.4, "completions/max_terminated_length": 2877.4, "completions/mean_length": 677.3794311523437, "completions/mean_terminated_length": 681.78896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 220.8, "epoch": 0.2999962500468744, "grad_norm": 0.0006309309974312782, "learning_rate": 2.1065989847715737e-06, "loss": -0.0074, "num_tokens": 265304944.0, "reward": 1.2552873849868775, "reward_std": 0.1533792197704315, "rewards/accuracy_reward": 0.7022569417953491, "rewards/brier_reward": 0.8147274374961853, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9935763835906982, "rewards/mean_confidence_reward": 0.6978819608688355, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013888888888888885, "completions/max_length": 3585.8, "completions/max_terminated_length": 3585.8, "completions/mean_length": 700.1473266601563, "completions/mean_terminated_length": 710.0007690429687, "completions/min_length": 0.0, "completions/min_terminated_length": 205.2, "epoch": 0.3119961000487494, "grad_norm": 0.0005744310328736901, "learning_rate": 1.9796954314720814e-06, "loss": -0.0155, "num_tokens": 276495441.0, "reward": 1.2269079685211182, "reward_std": 0.16605815887451172, "rewards/accuracy_reward": 0.6709201335906982, "rewards/brier_reward": 0.7968581199645997, "rewards/confidence_one_or_zero": 0.00026041667442768814, "rewards/format_reward": 0.9860243201255798, "rewards/mean_confidence_reward": 0.666854739189148, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007031249999999978, "completions/max_length": 3814.4, "completions/max_terminated_length": 3814.4, "completions/mean_length": 702.3100708007812, "completions/mean_terminated_length": 707.2436401367188, "completions/min_length": 0.0, "completions/min_terminated_length": 231.2, "epoch": 0.32399595005062437, "grad_norm": 0.0005819419166073203, "learning_rate": 1.852791878172589e-06, "loss": -0.0068, "num_tokens": 287679077.0, "reward": 1.251250958442688, "reward_std": 0.14554388970136642, "rewards/accuracy_reward": 0.6952257037162781, "rewards/brier_reward": 0.8143810749053955, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9928819537162781, "rewards/mean_confidence_reward": 0.6603732705116272, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00902777777777779, "completions/max_length": 3521.4, "completions/max_terminated_length": 3521.4, "completions/mean_length": 709.2608642578125, "completions/mean_terminated_length": 715.695703125, "completions/min_length": 0.0, "completions/min_terminated_length": 237.6, "epoch": 0.33599580005249935, "grad_norm": 0.0005530334892682731, "learning_rate": 1.7258883248730964e-06, "loss": -0.0088, "num_tokens": 298953986.0, "reward": 1.2382160425186157, "reward_std": 0.14605504274368286, "rewards/accuracy_reward": 0.6803819537162781, "rewards/brier_reward": 0.8050644397735596, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9909722208976746, "rewards/mean_confidence_reward": 0.6734331607818603, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00746527777777779, "completions/max_length": 3415.6, "completions/max_terminated_length": 3415.6, "completions/mean_length": 710.8110473632812, "completions/mean_terminated_length": 716.1941772460938, "completions/min_length": 0.0, "completions/min_terminated_length": 255.4, "epoch": 0.34799565005437433, "grad_norm": 0.0005190471420064569, "learning_rate": 1.5989847715736043e-06, "loss": -0.0078, "num_tokens": 310207137.0, "reward": 1.271397852897644, "reward_std": 0.13708409070968627, "rewards/accuracy_reward": 0.7222222208976745, "rewards/brier_reward": 0.828024935722351, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9925347208976746, "rewards/mean_confidence_reward": 0.6911779403686523, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003906249999999978, "completions/max_length": 3718.4, "completions/max_terminated_length": 3718.4, "completions/mean_length": 770.7046997070313, "completions/mean_terminated_length": 773.7459228515625, "completions/min_length": 0.0, "completions/min_terminated_length": 239.4, "epoch": 0.3599955000562493, "grad_norm": 0.0005380141083151102, "learning_rate": 1.4720812182741118e-06, "loss": -0.0025, "num_tokens": 322195991.0, "reward": 1.2562948942184449, "reward_std": 0.1399953231215477, "rewards/accuracy_reward": 0.7012152671813965, "rewards/brier_reward": 0.8152669310569763, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6894097328186035, "step": 150 }, { "epoch": 0.3599955000562493, "eval_completions/clipped_ratio": 0.008680555555555561, "eval_completions/max_length": 2509.0, "eval_completions/max_terminated_length": 2509.0, "eval_completions/mean_length": 739.3196309407552, "eval_completions/mean_terminated_length": 745.8552551269531, "eval_completions/min_length": 66.16666666666667, "eval_completions/min_terminated_length": 309.3333333333333, "eval_loss": 0.0, "eval_num_tokens": 322195991.0, "eval_reward": 1.2427564859390259, "eval_reward_std": 0.3247065891822179, "eval_rewards/accuracy_reward": 0.6866319477558136, "eval_rewards/brier_reward": 0.8075478275616964, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9913194477558136, "eval_rewards/mean_confidence_reward": 0.6826649407545725, "eval_runtime": 284.0655, "eval_samples_per_second": 3.52, "eval_steps_per_second": 0.021, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004166666666666652, "completions/max_length": 2905.4, "completions/max_terminated_length": 2905.4, "completions/mean_length": 732.27587890625, "completions/mean_terminated_length": 735.4041870117187, "completions/min_length": 0.0, "completions/min_terminated_length": 278.2, "epoch": 0.3719953500581243, "grad_norm": 0.0006105477805249393, "learning_rate": 1.3451776649746193e-06, "loss": -0.004, "num_tokens": 333739521.0, "reward": 1.2941375255584717, "reward_std": 0.13965744376182557, "rewards/accuracy_reward": 0.7543402791023255, "rewards/brier_reward": 0.8380874633789063, "rewards/confidence_one_or_zero": 0.0001736111124046147, "rewards/format_reward": 0.9958333253860474, "rewards/mean_confidence_reward": 0.6942404508590698, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008246527777777768, "completions/max_length": 2973.6, "completions/max_terminated_length": 2973.6, "completions/mean_length": 753.7942626953125, "completions/mean_terminated_length": 760.0675659179688, "completions/min_length": 0.0, "completions/min_terminated_length": 269.8, "epoch": 0.38399520005999926, "grad_norm": 0.0005153357633389533, "learning_rate": 1.218274111675127e-06, "loss": -0.0084, "num_tokens": 345510527.0, "reward": 1.2482587575912476, "reward_std": 0.14498619139194488, "rewards/accuracy_reward": 0.6890625, "rewards/brier_reward": 0.8156877160072327, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9917534708976745, "rewards/mean_confidence_reward": 0.6878429055213928, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007725694444444442, "completions/max_length": 3382.4, "completions/max_terminated_length": 3382.4, "completions/mean_length": 792.88369140625, "completions/mean_terminated_length": 799.1453125, "completions/min_length": 0.0, "completions/min_terminated_length": 315.0, "epoch": 0.39599505006187424, "grad_norm": 0.0005191068630665541, "learning_rate": 1.0913705583756345e-06, "loss": -0.0066, "num_tokens": 357783619.0, "reward": 1.2369651794433594, "reward_std": 0.14206757992506028, "rewards/accuracy_reward": 0.6712673664093017, "rewards/brier_reward": 0.8103755116462708, "rewards/confidence_one_or_zero": 0.0013020833430346102, "rewards/format_reward": 0.9922743082046509, "rewards/mean_confidence_reward": 0.6545373201370239, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005381944444444464, "completions/max_length": 3044.8, "completions/max_terminated_length": 3044.8, "completions/mean_length": 775.5628540039063, "completions/mean_terminated_length": 779.7671752929688, "completions/min_length": 0.0, "completions/min_terminated_length": 303.0, "epoch": 0.4079949000637492, "grad_norm": 0.0005426673451438546, "learning_rate": 9.644670050761422e-07, "loss": -0.0058, "num_tokens": 369807287.0, "reward": 1.2703901767730712, "reward_std": 0.1331357017159462, "rewards/accuracy_reward": 0.7223090291023254, "rewards/brier_reward": 0.8238399386405945, "rewards/confidence_one_or_zero": 0.0006076389108784497, "rewards/format_reward": 0.9946180701255798, "rewards/mean_confidence_reward": 0.664151918888092, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.007986111111111093, "completions/max_length": 3587.6, "completions/max_terminated_length": 3587.6, "completions/mean_length": 812.0387084960937, "completions/mean_terminated_length": 818.592236328125, "completions/min_length": 0.0, "completions/min_terminated_length": 253.6, "epoch": 0.4199947500656242, "grad_norm": 0.000529599201399833, "learning_rate": 8.375634517766498e-07, "loss": -0.0086, "num_tokens": 382269941.0, "reward": 1.2632122039794922, "reward_std": 0.1427206039428711, "rewards/accuracy_reward": 0.7166666626930237, "rewards/brier_reward": 0.8178175330162049, "rewards/confidence_one_or_zero": 0.0006944444554392249, "rewards/format_reward": 0.9919270992279052, "rewards/mean_confidence_reward": 0.6496527791023254, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00850694444444442, "completions/max_length": 3676.6, "completions/max_terminated_length": 3676.6, "completions/mean_length": 786.1634643554687, "completions/mean_terminated_length": 792.9117431640625, "completions/min_length": 0.0, "completions/min_terminated_length": 300.8, "epoch": 0.4319946000674992, "grad_norm": 0.0004947619745507836, "learning_rate": 7.106598984771574e-07, "loss": -0.0086, "num_tokens": 394426512.0, "reward": 1.269280982017517, "reward_std": 0.14375588595867156, "rewards/accuracy_reward": 0.7263888955116272, "rewards/brier_reward": 0.8207532525062561, "rewards/confidence_one_or_zero": 0.0011284722306299955, "rewards/format_reward": 0.991406238079071, "rewards/mean_confidence_reward": 0.6751345515251159, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011545138888888884, "completions/max_length": 3260.6, "completions/max_terminated_length": 3260.6, "completions/mean_length": 785.8776123046875, "completions/mean_terminated_length": 795.1585571289063, "completions/min_length": 0.0, "completions/min_terminated_length": 274.0, "epoch": 0.44399445006937416, "grad_norm": 0.0005510540795512497, "learning_rate": 5.83756345177665e-07, "loss": -0.0098, "num_tokens": 406569870.0, "reward": 1.2565922260284423, "reward_std": 0.1406260371208191, "rewards/accuracy_reward": 0.7071180582046509, "rewards/brier_reward": 0.8175978541374207, "rewards/confidence_one_or_zero": 0.00034722223062999547, "rewards/format_reward": 0.9884548544883728, "rewards/mean_confidence_reward": 0.68436199426651, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3258.4, "completions/max_terminated_length": 3258.4, "completions/mean_length": 780.2356811523438, "completions/mean_terminated_length": 786.3969848632812, "completions/min_length": 0.0, "completions/min_terminated_length": 293.6, "epoch": 0.45599430007124914, "grad_norm": 0.0005112453945912421, "learning_rate": 4.568527918781726e-07, "loss": -0.007, "num_tokens": 418641129.0, "reward": 1.2805275201797486, "reward_std": 0.14128611385822296, "rewards/accuracy_reward": 0.74140625, "rewards/brier_reward": 0.8274471998214722, "rewards/confidence_one_or_zero": 8.680555620230735e-05, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.7002317667007446, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01128472222222221, "completions/max_length": 3505.8, "completions/max_terminated_length": 3505.8, "completions/mean_length": 798.0532104492188, "completions/mean_terminated_length": 807.356103515625, "completions/min_length": 0.0, "completions/min_terminated_length": 268.8, "epoch": 0.46799415007312406, "grad_norm": 0.0005772958975285292, "learning_rate": 3.2994923857868026e-07, "loss": -0.011, "num_tokens": 430915566.0, "reward": 1.2434212923049928, "reward_std": 0.15206148028373717, "rewards/accuracy_reward": 0.6927083253860473, "rewards/brier_reward": 0.8054919719696045, "rewards/confidence_one_or_zero": 0.0004340277868323028, "rewards/format_reward": 0.9886284708976746, "rewards/mean_confidence_reward": 0.6865755438804626, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006857638888888884, "completions/max_length": 3417.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 776.532470703125, "completions/mean_terminated_length": 781.9117309570313, "completions/min_length": 0.0, "completions/min_terminated_length": 274.6, "epoch": 0.47999400007499904, "grad_norm": 0.0006025061593391001, "learning_rate": 2.0304568527918783e-07, "loss": -0.007, "num_tokens": 442929028.0, "reward": 1.2590484380722047, "reward_std": 0.14006066024303437, "rewards/accuracy_reward": 0.7050347089767456, "rewards/brier_reward": 0.8199058055877686, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9931423544883728, "rewards/mean_confidence_reward": 0.6951475501060486, "step": 200 }, { "epoch": 0.47999400007499904, "eval_completions/clipped_ratio": 0.006770833333333337, "eval_completions/max_length": 2710.8333333333335, "eval_completions/max_terminated_length": 2710.8333333333335, "eval_completions/mean_length": 788.9212036132812, "eval_completions/mean_terminated_length": 794.1668294270834, "eval_completions/min_length": 158.66666666666666, "eval_completions/min_terminated_length": 343.0, "eval_loss": 0.0, "eval_num_tokens": 442929028.0, "eval_reward": 1.254876693089803, "eval_reward_std": 0.30982651313145954, "eval_rewards/accuracy_reward": 0.6918402711550394, "eval_rewards/brier_reward": 0.8231076200803121, "eval_rewards/confidence_one_or_zero": 0.0008680555814256271, "eval_rewards/format_reward": 0.9947916666666666, "eval_rewards/mean_confidence_reward": 0.6819444298744202, "eval_runtime": 275.8753, "eval_samples_per_second": 3.625, "eval_steps_per_second": 0.022, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0052951388888889065, "completions/max_length": 3538.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 799.853662109375, "completions/mean_terminated_length": 804.1119750976562, "completions/min_length": 0.0, "completions/min_terminated_length": 265.2, "epoch": 0.491993850076874, "grad_norm": 0.0005411679157987237, "learning_rate": 7.614213197969544e-08, "loss": -0.0035, "num_tokens": 455209294.0, "reward": 1.2997782230377197, "reward_std": 0.1340979665517807, "rewards/accuracy_reward": 0.7611111044883728, "rewards/brier_reward": 0.8437265396118164, "rewards/confidence_one_or_zero": 0.0005208333546761424, "rewards/format_reward": 0.9947048664093018, "rewards/mean_confidence_reward": 0.6931857466697693, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003906249999999963, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 792.8047078450521, "completions/mean_terminated_length": 795.9739786783854, "completions/min_length": 0.0, "completions/min_terminated_length": 277.6666666666667, "epoch": 0.49919376007799904, "num_tokens": 462555208.0, "reward": 1.2620628277460735, "reward_std": 0.13802013049523035, "rewards/accuracy_reward": 0.7123842636744181, "rewards/brier_reward": 0.8156337340672811, "rewards/confidence_one_or_zero": 0.0015914352067435782, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6858723759651184, "step": 208, "total_flos": 0.0, "train_loss": -0.01281338286263725, "train_runtime": 80412.2515, "train_samples_per_second": 0.187, "train_steps_per_second": 0.003 } ], "logging_steps": 5, "max_steps": 208, "num_input_tokens_seen": 462555208, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }