{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5056, "eval_steps": 15, "global_step": 79, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.36458333333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 2780.46484375, "completions/mean_terminated_length": 2025.6495361328125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.0064, "grad_norm": 0.03387239947915077, "learning_rate": 0.0, "loss": 0.0002, "num_tokens": 4520194.0, "reward": 0.0608723983168602, "reward_std": 0.09049739688634872, "rewards/accuracy_reward": 0.0494791679084301, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.072265625, "rewards/mean_confidence_reward": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.33463541666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 2726.617919921875, "completions/mean_terminated_length": 2037.906982421875, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.0128, "grad_norm": 0.029681719839572906, "learning_rate": 3.125e-07, "loss": 0.0038, "num_tokens": 8959671.0, "reward": 0.0703125, "reward_std": 0.09117179363965988, "rewards/accuracy_reward": 0.064453125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.076171875, "rewards/mean_confidence_reward": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 2753.60546875, "completions/mean_terminated_length": 2050.446533203125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.0192, "grad_norm": 0.040974151343107224, "learning_rate": 6.25e-07, "loss": -0.0006, "num_tokens": 13444641.0, "reward": 0.0826822966337204, "reward_std": 0.12302601337432861, "rewards/accuracy_reward": 0.0748697891831398, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.0904947891831398, "rewards/mean_confidence_reward": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.32682291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 2688.072265625, "completions/mean_terminated_length": 2004.532958984375, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.0256, "grad_norm": 0.03401859849691391, "learning_rate": 9.375000000000001e-07, "loss": 0.0047, "num_tokens": 17810840.0, "reward": 0.0963541716337204, "reward_std": 0.11852055788040161, "rewards/accuracy_reward": 0.0833333358168602, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.109375, "rewards/mean_confidence_reward": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 2876.296875, "completions/mean_terminated_length": 2119.77197265625, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.032, "grad_norm": 0.033929161727428436, "learning_rate": 1.25e-06, "loss": 0.0041, "num_tokens": 22478032.0, "reward": 0.1051432341337204, "reward_std": 0.1347123086452484, "rewards/accuracy_reward": 0.095703125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.1145833358168602, "rewards/mean_confidence_reward": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.37825520833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2795.106201171875, "completions/mean_terminated_length": 2003.6722412109375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.0384, "grad_norm": 0.04229738935828209, "learning_rate": 1.5625e-06, "loss": 0.0039, "num_tokens": 27022347.0, "reward": 0.2288411557674408, "reward_std": 0.17552155256271362, "rewards/accuracy_reward": 0.2174479216337204, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.240234375, "rewards/mean_confidence_reward": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.36588541666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 2775.00537109375, "completions/mean_terminated_length": 2012.78857421875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.0448, "grad_norm": 0.03459632769227028, "learning_rate": 1.8750000000000003e-06, "loss": 0.0124, "num_tokens": 31537667.0, "reward": 0.3424479365348816, "reward_std": 0.17551496624946594, "rewards/accuracy_reward": 0.2955729067325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.3893229067325592, "rewards/mean_confidence_reward": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34700520833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 2669.205810546875, "completions/mean_terminated_length": 1910.9990234375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.0512, "grad_norm": 0.03504309430718422, "learning_rate": 2.1875000000000002e-06, "loss": 0.0106, "num_tokens": 35883055.0, "reward": 0.4195963740348816, "reward_std": 0.16831031441688538, "rewards/accuracy_reward": 0.37109375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.4680989682674408, "rewards/mean_confidence_reward": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.27408854166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2454.344482421875, "completions/mean_terminated_length": 1834.490478515625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.0576, "grad_norm": 0.1767028123140335, "learning_rate": 2.5e-06, "loss": 0.0059, "num_tokens": 39906592.0, "reward": 0.5374349355697632, "reward_std": 0.17072224617004395, "rewards/accuracy_reward": 0.50390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5709635615348816, "rewards/mean_confidence_reward": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.32877604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 2579.9296875, "completions/mean_terminated_length": 1837.334716796875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.064, "grad_norm": 0.030384620651602745, "learning_rate": 2.8125e-06, "loss": 0.0096, "num_tokens": 44126708.0, "reward": 0.5865885615348816, "reward_std": 0.14248789846897125, "rewards/accuracy_reward": 0.5494791865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.6236979365348816, "rewards/mean_confidence_reward": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28971354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 2423.814453125, "completions/mean_terminated_length": 1741.7589111328125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.0704, "grad_norm": 0.023964334279298782, "learning_rate": 3.125e-06, "loss": 0.0076, "num_tokens": 48099087.0, "reward": 0.6575521230697632, "reward_std": 0.11561574786901474, "rewards/accuracy_reward": 0.6139323115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.701171875, "rewards/mean_confidence_reward": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.26627604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 2389.55615234375, "completions/mean_terminated_length": 1770.269775390625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0768, "grad_norm": 0.02120368555188179, "learning_rate": 3.4375e-06, "loss": 0.0055, "num_tokens": 52013309.0, "reward": 0.669921875, "reward_std": 0.10116906464099884, "rewards/accuracy_reward": 0.6080729365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.7317708134651184, "rewards/mean_confidence_reward": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20703125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1986.2142333984375, "completions/mean_terminated_length": 1435.3834228515625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0832, "grad_norm": 0.030623242259025574, "learning_rate": 3.7500000000000005e-06, "loss": 0.0032, "num_tokens": 55310054.0, "reward": 0.72265625, "reward_std": 0.10442712903022766, "rewards/accuracy_reward": 0.6575520634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.7877604365348816, "rewards/mean_confidence_reward": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.154296875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1930.3646240234375, "completions/mean_terminated_length": 1535.2486572265625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0896, "grad_norm": 0.022816797718405724, "learning_rate": 4.0625000000000005e-06, "loss": 0.0094, "num_tokens": 58523078.0, "reward": 0.7763671875, "reward_std": 0.08641418814659119, "rewards/accuracy_reward": 0.708984375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.84375, "rewards/mean_confidence_reward": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19791666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 2009.919921875, "completions/mean_terminated_length": 1495.1728515625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.096, "grad_norm": 0.02108619175851345, "learning_rate": 4.3750000000000005e-06, "loss": 0.0077, "num_tokens": 61865667.0, "reward": 0.748046875, "reward_std": 0.0890018567442894, "rewards/accuracy_reward": 0.6946614384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.8014323115348816, "rewards/mean_confidence_reward": 0.0, "step": 15 }, { "epoch": 0.096, "eval_completions/clipped_ratio": 0.1730018028846154, "eval_completions/max_length": 4096.0, "eval_completions/max_terminated_length": 3884.25, "eval_completions/mean_length": 1958.4808502197266, "eval_completions/mean_terminated_length": 1510.7203216552734, "eval_completions/min_length": 328.0, "eval_completions/min_terminated_length": 328.0, "eval_loss": 0.0, "eval_num_tokens": 61865667.0, "eval_reward": 0.7666015625, "eval_reward_std": 0.36640918254852295, "eval_rewards/accuracy_reward": 0.70703125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.826171875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 784.7448, "eval_samples_per_second": 1.274, "eval_steps_per_second": 0.01, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.18294270833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 1902.54296875, "completions/mean_terminated_length": 1411.4183349609375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.1024, "grad_norm": 0.02185441181063652, "learning_rate": 4.6875000000000004e-06, "loss": 0.0082, "num_tokens": 65043269.0, "reward": 0.7708333730697632, "reward_std": 0.08093124628067017, "rewards/accuracy_reward": 0.7272135615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.814453125, "rewards/mean_confidence_reward": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21484375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 2011.791015625, "completions/mean_terminated_length": 1441.485107421875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.1088, "grad_norm": 0.01952357590198517, "learning_rate": 5e-06, "loss": 0.0102, "num_tokens": 68385996.0, "reward": 0.7210286855697632, "reward_std": 0.07699891179800034, "rewards/accuracy_reward": 0.6569010615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.78515625, "rewards/mean_confidence_reward": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14778645833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1735.443359375, "completions/mean_terminated_length": 1326.087890625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1152, "grad_norm": 0.01978280581533909, "learning_rate": 4.920634920634921e-06, "loss": 0.0059, "num_tokens": 71295525.0, "reward": 0.7796224355697632, "reward_std": 0.07605080306529999, "rewards/accuracy_reward": 0.70703125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.8522135615348816, "rewards/mean_confidence_reward": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.150390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1611.516357421875, "completions/mean_terminated_length": 1171.734130859375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.1216, "grad_norm": 0.02145254611968994, "learning_rate": 4.841269841269842e-06, "loss": 0.0045, "num_tokens": 74020350.0, "reward": 0.7809244990348816, "reward_std": 0.06806011497974396, "rewards/accuracy_reward": 0.712890625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.8489583134651184, "rewards/mean_confidence_reward": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13802083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 1633.2982177734375, "completions/mean_terminated_length": 1238.96826171875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.128, "grad_norm": 0.020902708172798157, "learning_rate": 4.761904761904762e-06, "loss": 0.0062, "num_tokens": 76775704.0, "reward": 0.7747396230697632, "reward_std": 0.07498719543218613, "rewards/accuracy_reward": 0.6875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.8619791865348816, "rewards/mean_confidence_reward": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10872395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 1514.591796875, "completions/mean_terminated_length": 1199.6939697265625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.1344, "grad_norm": 0.018049517646431923, "learning_rate": 4.682539682539683e-06, "loss": 0.0044, "num_tokens": 79351357.0, "reward": 0.8141276240348816, "reward_std": 0.06124332174658775, "rewards/accuracy_reward": 0.7369791865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.8912760615348816, "rewards/mean_confidence_reward": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09895833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4038.0, "completions/mean_length": 1392.6966552734375, "completions/mean_terminated_length": 1095.802001953125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.1408, "grad_norm": 0.01976490393280983, "learning_rate": 4.603174603174604e-06, "loss": 0.0073, "num_tokens": 81734811.0, "reward": 0.8177083730697632, "reward_std": 0.06400330364704132, "rewards/accuracy_reward": 0.734375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9010416865348816, "rewards/mean_confidence_reward": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 1432.442138671875, "completions/mean_terminated_length": 1163.22216796875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.1472, "grad_norm": 0.01790717802941799, "learning_rate": 4.523809523809524e-06, "loss": 0.0055, "num_tokens": 84190490.0, "reward": 0.8294271230697632, "reward_std": 0.05695289000868797, "rewards/accuracy_reward": 0.751953125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9069010615348816, "rewards/mean_confidence_reward": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 1136.582763671875, "completions/mean_terminated_length": 945.8510131835938, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1536, "grad_norm": 0.019688351079821587, "learning_rate": 4.444444444444444e-06, "loss": 0.0073, "num_tokens": 86182577.0, "reward": 0.8580729365348816, "reward_std": 0.060081034898757935, "rewards/accuracy_reward": 0.7766926884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.939453125, "rewards/mean_confidence_reward": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 1181.623046875, "completions/mean_terminated_length": 961.2080078125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.16, "grad_norm": 0.01960933580994606, "learning_rate": 4.365079365079366e-06, "loss": 0.0064, "num_tokens": 88240142.0, "reward": 0.8430989980697632, "reward_std": 0.06078281253576279, "rewards/accuracy_reward": 0.7565104365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9296875, "rewards/mean_confidence_reward": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05924479166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1170.4818115234375, "completions/mean_terminated_length": 986.2449951171875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1664, "grad_norm": 0.019044535234570503, "learning_rate": 4.2857142857142855e-06, "loss": 0.0052, "num_tokens": 90288618.0, "reward": 0.8356119990348816, "reward_std": 0.06481396406888962, "rewards/accuracy_reward": 0.73046875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9407551884651184, "rewards/mean_confidence_reward": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 848.7389526367188, "completions/mean_terminated_length": 790.6369018554688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1728, "grad_norm": 0.02053948864340782, "learning_rate": 4.206349206349207e-06, "loss": 0.002, "num_tokens": 91833401.0, "reward": 0.8896484375, "reward_std": 0.04453360289335251, "rewards/accuracy_reward": 0.796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.982421875, "rewards/mean_confidence_reward": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 1033.705810546875, "completions/mean_terminated_length": 909.2222290039062, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.1792, "grad_norm": 0.01965363323688507, "learning_rate": 4.126984126984127e-06, "loss": 0.0043, "num_tokens": 93668645.0, "reward": 0.8759765625, "reward_std": 0.06271578371524811, "rewards/accuracy_reward": 0.791015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9609375, "rewards/mean_confidence_reward": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03190104166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4064.0, "completions/mean_length": 944.6478271484375, "completions/mean_terminated_length": 840.8035888671875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1856, "grad_norm": 0.017758071422576904, "learning_rate": 4.047619047619048e-06, "loss": 0.0019, "num_tokens": 95360064.0, "reward": 0.8597005605697632, "reward_std": 0.048148658126592636, "rewards/accuracy_reward": 0.7513020634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9680989384651184, "rewards/mean_confidence_reward": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02799479166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 963.6784057617188, "completions/mean_terminated_length": 873.4641723632812, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.192, "grad_norm": 0.020760418847203255, "learning_rate": 3.968253968253968e-06, "loss": 0.0067, "num_tokens": 97084650.0, "reward": 0.8678385615348816, "reward_std": 0.05553457885980606, "rewards/accuracy_reward": 0.763671875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9720051884651184, "rewards/mean_confidence_reward": 0.0, "step": 30 }, { "epoch": 0.192, "eval_completions/clipped_ratio": 0.021709735576923073, "eval_completions/max_length": 4096.0, "eval_completions/max_terminated_length": 3616.375, "eval_completions/mean_length": 845.1283874511719, "eval_completions/mean_terminated_length": 772.5844650268555, "eval_completions/min_length": 154.25, "eval_completions/min_terminated_length": 154.25, "eval_loss": 0.0, "eval_num_tokens": 97084650.0, "eval_reward": 0.87646484375, "eval_reward_std": 0.21883795596659184, "eval_rewards/accuracy_reward": 0.7744140625, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.978515625, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 659.9975, "eval_samples_per_second": 1.515, "eval_steps_per_second": 0.012, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 878.7572021484375, "completions/mean_terminated_length": 810.30517578125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1984, "grad_norm": 0.020707620307803154, "learning_rate": 3.88888888888889e-06, "loss": 0.003, "num_tokens": 98685229.0, "reward": 0.8831380605697632, "reward_std": 0.05947504937648773, "rewards/accuracy_reward": 0.7884114384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9778645634651184, "rewards/mean_confidence_reward": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4014.0, "completions/mean_length": 817.9010620117188, "completions/mean_terminated_length": 792.0892333984375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.2048, "grad_norm": 0.01970040611922741, "learning_rate": 3.80952380952381e-06, "loss": 0.0001, "num_tokens": 100184125.0, "reward": 0.8935546875, "reward_std": 0.04857983812689781, "rewards/accuracy_reward": 0.794921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 802.2428588867188, "completions/mean_terminated_length": 749.9609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.2112, "grad_norm": 0.02303779311478138, "learning_rate": 3.7301587301587305e-06, "loss": 0.0039, "num_tokens": 101661458.0, "reward": 0.8557943105697632, "reward_std": 0.06328541040420532, "rewards/accuracy_reward": 0.7272135615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 787.0625, "completions/mean_terminated_length": 734.5396728515625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.2176, "grad_norm": 0.018765997141599655, "learning_rate": 3.6507936507936507e-06, "loss": 0.0032, "num_tokens": 103115794.0, "reward": 0.8919271230697632, "reward_std": 0.043571051210165024, "rewards/accuracy_reward": 0.7994791865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 764.9583740234375, "completions/mean_terminated_length": 732.1078491210938, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.224, "grad_norm": 0.02199125476181507, "learning_rate": 3.5714285714285718e-06, "loss": 0.001, "num_tokens": 104538930.0, "reward": 0.8860677480697632, "reward_std": 0.05475646257400513, "rewards/accuracy_reward": 0.7819010615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3941.0, "completions/mean_length": 731.091796875, "completions/mean_terminated_length": 709.0413208007812, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.2304, "grad_norm": 0.019916215911507607, "learning_rate": 3.492063492063492e-06, "loss": 0.0015, "num_tokens": 105911879.0, "reward": 0.8876953125, "reward_std": 0.04180898517370224, "rewards/accuracy_reward": 0.7819010615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3785.0, "completions/mean_length": 782.7923583984375, "completions/mean_terminated_length": 745.7122802734375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2368, "grad_norm": 0.021914253011345863, "learning_rate": 3.412698412698413e-06, "loss": 0.0017, "num_tokens": 107360480.0, "reward": 0.88671875, "reward_std": 0.053066499531269073, "rewards/accuracy_reward": 0.7845051884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 767.8822021484375, "completions/mean_terminated_length": 741.676513671875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2432, "grad_norm": 0.023255277425050735, "learning_rate": 3.3333333333333333e-06, "loss": 0.0007, "num_tokens": 108788163.0, "reward": 0.8880208730697632, "reward_std": 0.05833045765757561, "rewards/accuracy_reward": 0.7838541865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01888020833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 821.6953125, "completions/mean_terminated_length": 758.6861572265625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.2496, "grad_norm": 0.02124691754579544, "learning_rate": 3.2539682539682544e-06, "loss": 0.0031, "num_tokens": 110295215.0, "reward": 0.876953125, "reward_std": 0.056418370455503464, "rewards/accuracy_reward": 0.7727864384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9811198115348816, "rewards/mean_confidence_reward": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 746.0670776367188, "completions/mean_terminated_length": 713.0302734375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.256, "grad_norm": 0.021513385698199272, "learning_rate": 3.1746031746031746e-06, "loss": 0.0034, "num_tokens": 111691110.0, "reward": 0.8782552480697632, "reward_std": 0.05017038434743881, "rewards/accuracy_reward": 0.7662760615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 741.466796875, "completions/mean_terminated_length": 703.9242553710938, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.2624, "grad_norm": 0.019013214856386185, "learning_rate": 3.0952380952380957e-06, "loss": 0.0013, "num_tokens": 113077179.0, "reward": 0.8899739980697632, "reward_std": 0.038621604442596436, "rewards/accuracy_reward": 0.791015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3970.0, "completions/mean_length": 693.7057495117188, "completions/mean_terminated_length": 660.152587890625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2688, "grad_norm": 0.022837555035948753, "learning_rate": 3.015873015873016e-06, "loss": 0.0005, "num_tokens": 114389551.0, "reward": 0.8785807490348816, "reward_std": 0.045825596898794174, "rewards/accuracy_reward": 0.7669270634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3969.0, "completions/mean_length": 813.830078125, "completions/mean_terminated_length": 761.7321166992188, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.2752, "grad_norm": 0.020699184387922287, "learning_rate": 2.936507936507937e-06, "loss": 0.0034, "num_tokens": 115888346.0, "reward": 0.8785807490348816, "reward_std": 0.050285469740629196, "rewards/accuracy_reward": 0.7727864384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3888.0, "completions/mean_length": 724.4251708984375, "completions/mean_terminated_length": 695.6461181640625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.2816, "grad_norm": 0.024175629019737244, "learning_rate": 2.8571428571428573e-06, "loss": 0.0019, "num_tokens": 117249167.0, "reward": 0.9088541865348816, "reward_std": 0.054964929819107056, "rewards/accuracy_reward": 0.826171875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 746.92578125, "completions/mean_terminated_length": 709.4443359375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.288, "grad_norm": 0.021540403366088867, "learning_rate": 2.7777777777777783e-06, "loss": 0.0011, "num_tokens": 118642173.0, "reward": 0.8756510615348816, "reward_std": 0.05108930170536041, "rewards/accuracy_reward": 0.7623698115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.0, "step": 45 }, { "epoch": 0.288, "eval_completions/clipped_ratio": 0.005859375, "eval_completions/max_length": 3775.5, "eval_completions/max_terminated_length": 3543.375, "eval_completions/mean_length": 740.3269271850586, "eval_completions/mean_terminated_length": 720.5047912597656, "eval_completions/min_length": 186.875, "eval_completions/min_terminated_length": 186.875, "eval_loss": 0.0, "eval_num_tokens": 118642173.0, "eval_reward": 0.88916015625, "eval_reward_std": 0.19981716014444828, "eval_rewards/accuracy_reward": 0.7841796875, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.994140625, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 603.9532, "eval_samples_per_second": 1.656, "eval_steps_per_second": 0.013, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4011.0, "completions/mean_length": 716.748046875, "completions/mean_terminated_length": 696.8310546875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2944, "grad_norm": 0.022779908031225204, "learning_rate": 2.6984126984126986e-06, "loss": 0.0004, "num_tokens": 119996194.0, "reward": 0.9026693105697632, "reward_std": 0.05144798010587692, "rewards/accuracy_reward": 0.8111979365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 770.3229370117188, "completions/mean_terminated_length": 746.3344116210938, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.3008, "grad_norm": 0.01919720135629177, "learning_rate": 2.6190476190476192e-06, "loss": 0.001, "num_tokens": 121428754.0, "reward": 0.8958333730697632, "reward_std": 0.04597648233175278, "rewards/accuracy_reward": 0.798828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 835.46875, "completions/mean_terminated_length": 774.9283447265625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.3072, "grad_norm": 0.02324065938591957, "learning_rate": 2.53968253968254e-06, "loss": 0.0024, "num_tokens": 122966546.0, "reward": 0.857421875, "reward_std": 0.062466781586408615, "rewards/accuracy_reward": 0.7330729365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9817708134651184, "rewards/mean_confidence_reward": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01627604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 792.6647338867188, "completions/mean_terminated_length": 738.0099487304688, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3136, "grad_norm": 0.022559164091944695, "learning_rate": 2.4603174603174605e-06, "loss": 0.0016, "num_tokens": 124435511.0, "reward": 0.890625, "reward_std": 0.05695483088493347, "rewards/accuracy_reward": 0.7975260615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3965.0, "completions/mean_length": 756.9381713867188, "completions/mean_terminated_length": 717.3445434570312, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.32, "grad_norm": 0.023683471605181694, "learning_rate": 2.380952380952381e-06, "loss": 0.0034, "num_tokens": 125844928.0, "reward": 0.8766276240348816, "reward_std": 0.0529298409819603, "rewards/accuracy_reward": 0.7649739384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3821.0, "completions/mean_length": 673.533203125, "completions/mean_terminated_length": 642.0518798828125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.3264, "grad_norm": 0.0213849525898695, "learning_rate": 2.301587301587302e-06, "loss": 0.002, "num_tokens": 127124315.0, "reward": 0.8948568105697632, "reward_std": 0.04318075627088547, "rewards/accuracy_reward": 0.798828125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 713.7291870117188, "completions/mean_terminated_length": 693.7943725585938, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.3328, "grad_norm": 0.02206575684249401, "learning_rate": 2.222222222222222e-06, "loss": 0.0026, "num_tokens": 128472651.0, "reward": 0.8961588740348816, "reward_std": 0.0437723845243454, "rewards/accuracy_reward": 0.7981770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3920.0, "completions/mean_length": 780.7955932617188, "completions/mean_terminated_length": 765.6180419921875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.3392, "grad_norm": 0.017915785312652588, "learning_rate": 2.1428571428571427e-06, "loss": 0.0014, "num_tokens": 129916081.0, "reward": 0.9124349355697632, "reward_std": 0.04158523678779602, "rewards/accuracy_reward": 0.8294270634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3977.0, "completions/mean_length": 673.7780151367188, "completions/mean_terminated_length": 651.3519287109375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.3456, "grad_norm": 0.020742084830999374, "learning_rate": 2.0634920634920634e-06, "loss": 0.0008, "num_tokens": 131197388.0, "reward": 0.8958333730697632, "reward_std": 0.04164648801088333, "rewards/accuracy_reward": 0.7981770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 732.0514526367188, "completions/mean_terminated_length": 705.5636596679688, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.352, "grad_norm": 0.019956564530730247, "learning_rate": 1.984126984126984e-06, "loss": 0.001, "num_tokens": 132571659.0, "reward": 0.8857421875, "reward_std": 0.045707326382398605, "rewards/accuracy_reward": 0.779296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4006.0, "completions/mean_length": 767.5592651367188, "completions/mean_terminated_length": 750.1328735351562, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.3584, "grad_norm": 0.01656208373606205, "learning_rate": 1.904761904761905e-06, "loss": 0.0026, "num_tokens": 134000862.0, "reward": 0.9020182490348816, "reward_std": 0.038941457867622375, "rewards/accuracy_reward": 0.8092448115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3850.0, "completions/mean_length": 661.19921875, "completions/mean_terminated_length": 652.2310791015625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3648, "grad_norm": 0.018361348658800125, "learning_rate": 1.8253968253968254e-06, "loss": 0.0015, "num_tokens": 135264776.0, "reward": 0.9013671875, "reward_std": 0.03878443315625191, "rewards/accuracy_reward": 0.8053385615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 766.0013427734375, "completions/mean_terminated_length": 722.0699462890625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.3712, "grad_norm": 0.020110923796892166, "learning_rate": 1.746031746031746e-06, "loss": 0.0031, "num_tokens": 136692626.0, "reward": 0.9104818105697632, "reward_std": 0.05508416146039963, "rewards/accuracy_reward": 0.833984375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3904.0, "completions/mean_length": 759.0625, "completions/mean_terminated_length": 739.3948974609375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3776, "grad_norm": 0.02654006890952587, "learning_rate": 1.6666666666666667e-06, "loss": 0.0013, "num_tokens": 138104586.0, "reward": 0.8876953125, "reward_std": 0.044509004801511765, "rewards/accuracy_reward": 0.78125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3975.0, "completions/mean_length": 840.5924682617188, "completions/mean_terminated_length": 810.6478271484375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.384, "grad_norm": 0.021109433844685555, "learning_rate": 1.5873015873015873e-06, "loss": 0.0025, "num_tokens": 139644632.0, "reward": 0.8779296875, "reward_std": 0.05428009480237961, "rewards/accuracy_reward": 0.7649739384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 60 }, { "epoch": 0.384, "eval_completions/clipped_ratio": 0.005859375, "eval_completions/max_length": 3920.5, "eval_completions/max_terminated_length": 3464.5, "eval_completions/mean_length": 786.6544494628906, "eval_completions/mean_terminated_length": 767.0381546020508, "eval_completions/min_length": 187.375, "eval_completions/min_terminated_length": 187.375, "eval_loss": 0.0, "eval_num_tokens": 139644632.0, "eval_reward": 0.884765625, "eval_reward_std": 0.2027157824486494, "eval_rewards/accuracy_reward": 0.775390625, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.994140625, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 622.9608, "eval_samples_per_second": 1.605, "eval_steps_per_second": 0.013, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01236979166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 900.9095458984375, "completions/mean_terminated_length": 860.891845703125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3904, "grad_norm": 0.019216015934944153, "learning_rate": 1.507936507936508e-06, "loss": 0.0014, "num_tokens": 141286341.0, "reward": 0.861328125, "reward_std": 0.05327238887548447, "rewards/accuracy_reward": 0.7350260615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9876301884651184, "rewards/mean_confidence_reward": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 839.0540771484375, "completions/mean_terminated_length": 780.7786865234375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.3968, "grad_norm": 0.021654291078448296, "learning_rate": 1.4285714285714286e-06, "loss": 0.0025, "num_tokens": 142824816.0, "reward": 0.8942057490348816, "reward_std": 0.06182871386408806, "rewards/accuracy_reward": 0.8059895634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.982421875, "rewards/mean_confidence_reward": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 764.9212646484375, "completions/mean_terminated_length": 732.0703735351562, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.4032, "grad_norm": 0.018562061712145805, "learning_rate": 1.3492063492063493e-06, "loss": 0.0021, "num_tokens": 144247383.0, "reward": 0.8984375, "reward_std": 0.041560571640729904, "rewards/accuracy_reward": 0.806640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4011.0, "completions/mean_length": 841.3509521484375, "completions/mean_terminated_length": 820.02294921875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.4096, "grad_norm": 0.018766365945339203, "learning_rate": 1.26984126984127e-06, "loss": 0.0012, "num_tokens": 145786162.0, "reward": 0.9117838740348816, "reward_std": 0.035551950335502625, "rewards/accuracy_reward": 0.830078125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3912.0, "completions/mean_length": 796.5963745117188, "completions/mean_terminated_length": 744.224853515625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.416, "grad_norm": 0.021488916128873825, "learning_rate": 1.1904761904761906e-06, "loss": 0.0036, "num_tokens": 147262862.0, "reward": 0.8873698115348816, "reward_std": 0.04491402953863144, "rewards/accuracy_reward": 0.7903645634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3934.0, "completions/mean_length": 757.4036865234375, "completions/mean_terminated_length": 746.5003051757812, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.4224, "grad_norm": 0.014640610665082932, "learning_rate": 1.111111111111111e-06, "loss": 0.0013, "num_tokens": 148673874.0, "reward": 0.9049479365348816, "reward_std": 0.029130559414625168, "rewards/accuracy_reward": 0.8131510615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3851.0, "completions/mean_length": 728.4642333984375, "completions/mean_terminated_length": 721.8741455078125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.4288, "grad_norm": 0.017500903457403183, "learning_rate": 1.0317460317460317e-06, "loss": 0.0009, "num_tokens": 150038443.0, "reward": 0.9033203125, "reward_std": 0.03719018027186394, "rewards/accuracy_reward": 0.80859375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01497395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 797.9720458984375, "completions/mean_terminated_length": 747.8367919921875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.4352, "grad_norm": 0.018346579745411873, "learning_rate": 9.523809523809525e-07, "loss": 0.0037, "num_tokens": 151514616.0, "reward": 0.880859375, "reward_std": 0.048622481524944305, "rewards/accuracy_reward": 0.7766926884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9850260615348816, "rewards/mean_confidence_reward": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01627604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3999.0, "completions/mean_length": 867.1784057617188, "completions/mean_terminated_length": 813.7564697265625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.4416, "grad_norm": 0.019951898604631424, "learning_rate": 8.73015873015873e-07, "loss": -0.0004, "num_tokens": 153096354.0, "reward": 0.8893229365348816, "reward_std": 0.053706176578998566, "rewards/accuracy_reward": 0.794921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3932.0, "completions/mean_length": 775.0885620117188, "completions/mean_terminated_length": 755.515380859375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.448, "grad_norm": 0.019857991486787796, "learning_rate": 7.936507936507937e-07, "loss": 0.0015, "num_tokens": 154536874.0, "reward": 0.9117838740348816, "reward_std": 0.050319306552410126, "rewards/accuracy_reward": 0.8294270634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3974.0, "completions/mean_length": 759.080078125, "completions/mean_terminated_length": 735.010498046875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.4544, "grad_norm": 0.017306774854660034, "learning_rate": 7.142857142857143e-07, "loss": 0.0006, "num_tokens": 155947013.0, "reward": 0.9140625, "reward_std": 0.03777480125427246, "rewards/accuracy_reward": 0.8352864384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3886.0, "completions/mean_length": 777.9954833984375, "completions/mean_terminated_length": 747.4750366210938, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.4608, "grad_norm": 0.01818043924868107, "learning_rate": 6.34920634920635e-07, "loss": 0.0022, "num_tokens": 157392574.0, "reward": 0.89453125, "reward_std": 0.03996651619672775, "rewards/accuracy_reward": 0.7981770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01627604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 819.3580932617188, "completions/mean_terminated_length": 765.1449584960938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.4672, "grad_norm": 0.018845003098249435, "learning_rate": 5.555555555555555e-07, "loss": 0.0013, "num_tokens": 158892196.0, "reward": 0.8815104365348816, "reward_std": 0.04560422524809837, "rewards/accuracy_reward": 0.779296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 759.9166870117188, "completions/mean_terminated_length": 746.833984375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.4736, "grad_norm": 0.017951594665646553, "learning_rate": 4.7619047619047623e-07, "loss": 0.0021, "num_tokens": 160304956.0, "reward": 0.9007161855697632, "reward_std": 0.03838188201189041, "rewards/accuracy_reward": 0.8053385615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3974.0, "completions/mean_length": 832.7454833984375, "completions/mean_terminated_length": 811.361083984375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.48, "grad_norm": 0.01818317361176014, "learning_rate": 3.9682539682539683e-07, "loss": 0.001, "num_tokens": 161828517.0, "reward": 0.8746744990348816, "reward_std": 0.04868558794260025, "rewards/accuracy_reward": 0.755859375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 75 }, { "epoch": 0.48, "eval_completions/clipped_ratio": 0.0068359375, "eval_completions/max_length": 3771.875, "eval_completions/max_terminated_length": 3558.375, "eval_completions/mean_length": 793.0995407104492, "eval_completions/mean_terminated_length": 770.1827087402344, "eval_completions/min_length": 188.125, "eval_completions/min_terminated_length": 188.125, "eval_loss": 0.0, "eval_num_tokens": 161828517.0, "eval_reward": 0.880859375, "eval_reward_std": 0.20689411088824272, "eval_rewards/accuracy_reward": 0.76953125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9921875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 625.2922, "eval_samples_per_second": 1.599, "eval_steps_per_second": 0.013, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4003.0, "completions/mean_length": 772.556640625, "completions/mean_terminated_length": 748.5842895507812, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.4864, "grad_norm": 0.015109594911336899, "learning_rate": 3.174603174603175e-07, "loss": 0.0019, "num_tokens": 163259364.0, "reward": 0.9322916865348816, "reward_std": 0.034481633454561234, "rewards/accuracy_reward": 0.8717448115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3903.0, "completions/mean_length": 738.8314208984375, "completions/mean_terminated_length": 714.61572265625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.4928, "grad_norm": 0.01741495169699192, "learning_rate": 2.3809523809523811e-07, "loss": 0.0019, "num_tokens": 164639529.0, "reward": 0.9130859375, "reward_std": 0.040897756814956665, "rewards/accuracy_reward": 0.8333333134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 806.326171875, "completions/mean_terminated_length": 786.9371337890625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.4992, "grad_norm": 0.018980255350470543, "learning_rate": 1.5873015873015874e-07, "loss": 0.0009, "num_tokens": 166128470.0, "reward": 0.8951823115348816, "reward_std": 0.04883330315351486, "rewards/accuracy_reward": 0.7962239384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 797.5814208984375, "completions/mean_terminated_length": 758.4697265625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.5056, "grad_norm": 0.018569298088550568, "learning_rate": 7.936507936507937e-08, "loss": 0.0027, "num_tokens": 167603235.0, "reward": 0.900390625, "reward_std": 0.04614834114909172, "rewards/accuracy_reward": 0.8125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 79 }, { "epoch": 0.5056, "step": 79, "total_flos": 0.0, "train_loss": 0.003337171362695296, "train_runtime": 71057.8488, "train_samples_per_second": 0.211, "train_steps_per_second": 0.001 } ], "logging_steps": 1, "max_steps": 79, "num_input_tokens_seen": 167603235, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }