Files
qwen3-8B-rlvr_g8_b384_math/trainer_state.json
ModelHub XC 3172051fd3 初始化项目,由ModelHub XC社区提供模型
Model: gguk2on/qwen3-8B-rlvr_g8_b384_math
Source: Original Platform
2026-05-30 04:56:18 +08:00

2292 lines
83 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5056,
"eval_steps": 15,
"global_step": 79,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.36458333333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4090.0,
"completions/mean_length": 2780.46484375,
"completions/mean_terminated_length": 2025.6495361328125,
"completions/min_length": 263.0,
"completions/min_terminated_length": 263.0,
"epoch": 0.0064,
"grad_norm": 0.03387239947915077,
"learning_rate": 0.0,
"loss": 0.0002,
"num_tokens": 4520194.0,
"reward": 0.0608723983168602,
"reward_std": 0.09049739688634872,
"rewards/accuracy_reward": 0.0494791679084301,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.072265625,
"rewards/mean_confidence_reward": 0.0,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.33463541666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4089.0,
"completions/mean_length": 2726.617919921875,
"completions/mean_terminated_length": 2037.906982421875,
"completions/min_length": 413.0,
"completions/min_terminated_length": 413.0,
"epoch": 0.0128,
"grad_norm": 0.029681719839572906,
"learning_rate": 3.125e-07,
"loss": 0.0038,
"num_tokens": 8959671.0,
"reward": 0.0703125,
"reward_std": 0.09117179363965988,
"rewards/accuracy_reward": 0.064453125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.076171875,
"rewards/mean_confidence_reward": 0.0,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4088.0,
"completions/mean_length": 2753.60546875,
"completions/mean_terminated_length": 2050.446533203125,
"completions/min_length": 321.0,
"completions/min_terminated_length": 321.0,
"epoch": 0.0192,
"grad_norm": 0.040974151343107224,
"learning_rate": 6.25e-07,
"loss": -0.0006,
"num_tokens": 13444641.0,
"reward": 0.0826822966337204,
"reward_std": 0.12302601337432861,
"rewards/accuracy_reward": 0.0748697891831398,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.0904947891831398,
"rewards/mean_confidence_reward": 0.0,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.32682291666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4074.0,
"completions/mean_length": 2688.072265625,
"completions/mean_terminated_length": 2004.532958984375,
"completions/min_length": 326.0,
"completions/min_terminated_length": 326.0,
"epoch": 0.0256,
"grad_norm": 0.03401859849691391,
"learning_rate": 9.375000000000001e-07,
"loss": 0.0047,
"num_tokens": 17810840.0,
"reward": 0.0963541716337204,
"reward_std": 0.11852055788040161,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.109375,
"rewards/mean_confidence_reward": 0.0,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3828125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4095.0,
"completions/mean_length": 2876.296875,
"completions/mean_terminated_length": 2119.77197265625,
"completions/min_length": 464.0,
"completions/min_terminated_length": 464.0,
"epoch": 0.032,
"grad_norm": 0.033929161727428436,
"learning_rate": 1.25e-06,
"loss": 0.0041,
"num_tokens": 22478032.0,
"reward": 0.1051432341337204,
"reward_std": 0.1347123086452484,
"rewards/accuracy_reward": 0.095703125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.1145833358168602,
"rewards/mean_confidence_reward": 0.0,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.37825520833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4096.0,
"completions/mean_length": 2795.106201171875,
"completions/mean_terminated_length": 2003.6722412109375,
"completions/min_length": 288.0,
"completions/min_terminated_length": 288.0,
"epoch": 0.0384,
"grad_norm": 0.04229738935828209,
"learning_rate": 1.5625e-06,
"loss": 0.0039,
"num_tokens": 27022347.0,
"reward": 0.2288411557674408,
"reward_std": 0.17552155256271362,
"rewards/accuracy_reward": 0.2174479216337204,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.240234375,
"rewards/mean_confidence_reward": 0.0,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.36588541666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4088.0,
"completions/mean_length": 2775.00537109375,
"completions/mean_terminated_length": 2012.78857421875,
"completions/min_length": 315.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.0448,
"grad_norm": 0.03459632769227028,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.0124,
"num_tokens": 31537667.0,
"reward": 0.3424479365348816,
"reward_std": 0.17551496624946594,
"rewards/accuracy_reward": 0.2955729067325592,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.3893229067325592,
"rewards/mean_confidence_reward": 0.0,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34700520833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4087.0,
"completions/mean_length": 2669.205810546875,
"completions/mean_terminated_length": 1910.9990234375,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"epoch": 0.0512,
"grad_norm": 0.03504309430718422,
"learning_rate": 2.1875000000000002e-06,
"loss": 0.0106,
"num_tokens": 35883055.0,
"reward": 0.4195963740348816,
"reward_std": 0.16831031441688538,
"rewards/accuracy_reward": 0.37109375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.4680989682674408,
"rewards/mean_confidence_reward": 0.0,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.27408854166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4096.0,
"completions/mean_length": 2454.344482421875,
"completions/mean_terminated_length": 1834.490478515625,
"completions/min_length": 277.0,
"completions/min_terminated_length": 277.0,
"epoch": 0.0576,
"grad_norm": 0.1767028123140335,
"learning_rate": 2.5e-06,
"loss": 0.0059,
"num_tokens": 39906592.0,
"reward": 0.5374349355697632,
"reward_std": 0.17072224617004395,
"rewards/accuracy_reward": 0.50390625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.5709635615348816,
"rewards/mean_confidence_reward": 0.0,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.32877604166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4093.0,
"completions/mean_length": 2579.9296875,
"completions/mean_terminated_length": 1837.334716796875,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.064,
"grad_norm": 0.030384620651602745,
"learning_rate": 2.8125e-06,
"loss": 0.0096,
"num_tokens": 44126708.0,
"reward": 0.5865885615348816,
"reward_std": 0.14248789846897125,
"rewards/accuracy_reward": 0.5494791865348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.6236979365348816,
"rewards/mean_confidence_reward": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28971354166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4085.0,
"completions/mean_length": 2423.814453125,
"completions/mean_terminated_length": 1741.7589111328125,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.0704,
"grad_norm": 0.023964334279298782,
"learning_rate": 3.125e-06,
"loss": 0.0076,
"num_tokens": 48099087.0,
"reward": 0.6575521230697632,
"reward_std": 0.11561574786901474,
"rewards/accuracy_reward": 0.6139323115348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.701171875,
"rewards/mean_confidence_reward": 0.0,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.26627604166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4084.0,
"completions/mean_length": 2389.55615234375,
"completions/mean_terminated_length": 1770.269775390625,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.0768,
"grad_norm": 0.02120368555188179,
"learning_rate": 3.4375e-06,
"loss": 0.0055,
"num_tokens": 52013309.0,
"reward": 0.669921875,
"reward_std": 0.10116906464099884,
"rewards/accuracy_reward": 0.6080729365348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.7317708134651184,
"rewards/mean_confidence_reward": 0.0,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.20703125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4092.0,
"completions/mean_length": 1986.2142333984375,
"completions/mean_terminated_length": 1435.3834228515625,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.0832,
"grad_norm": 0.030623242259025574,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0032,
"num_tokens": 55310054.0,
"reward": 0.72265625,
"reward_std": 0.10442712903022766,
"rewards/accuracy_reward": 0.6575520634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.7877604365348816,
"rewards/mean_confidence_reward": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.154296875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4095.0,
"completions/mean_length": 1930.3646240234375,
"completions/mean_terminated_length": 1535.2486572265625,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.0896,
"grad_norm": 0.022816797718405724,
"learning_rate": 4.0625000000000005e-06,
"loss": 0.0094,
"num_tokens": 58523078.0,
"reward": 0.7763671875,
"reward_std": 0.08641418814659119,
"rewards/accuracy_reward": 0.708984375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.84375,
"rewards/mean_confidence_reward": 0.0,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.19791666666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4087.0,
"completions/mean_length": 2009.919921875,
"completions/mean_terminated_length": 1495.1728515625,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"epoch": 0.096,
"grad_norm": 0.02108619175851345,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.0077,
"num_tokens": 61865667.0,
"reward": 0.748046875,
"reward_std": 0.0890018567442894,
"rewards/accuracy_reward": 0.6946614384651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.8014323115348816,
"rewards/mean_confidence_reward": 0.0,
"step": 15
},
{
"epoch": 0.096,
"eval_completions/clipped_ratio": 0.1730018028846154,
"eval_completions/max_length": 4096.0,
"eval_completions/max_terminated_length": 3884.25,
"eval_completions/mean_length": 1958.4808502197266,
"eval_completions/mean_terminated_length": 1510.7203216552734,
"eval_completions/min_length": 328.0,
"eval_completions/min_terminated_length": 328.0,
"eval_loss": 0.0,
"eval_num_tokens": 61865667.0,
"eval_reward": 0.7666015625,
"eval_reward_std": 0.36640918254852295,
"eval_rewards/accuracy_reward": 0.70703125,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.826171875,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 784.7448,
"eval_samples_per_second": 1.274,
"eval_steps_per_second": 0.01,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.18294270833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4093.0,
"completions/mean_length": 1902.54296875,
"completions/mean_terminated_length": 1411.4183349609375,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"epoch": 0.1024,
"grad_norm": 0.02185441181063652,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.0082,
"num_tokens": 65043269.0,
"reward": 0.7708333730697632,
"reward_std": 0.08093124628067017,
"rewards/accuracy_reward": 0.7272135615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.814453125,
"rewards/mean_confidence_reward": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21484375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4096.0,
"completions/mean_length": 2011.791015625,
"completions/mean_terminated_length": 1441.485107421875,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.1088,
"grad_norm": 0.01952357590198517,
"learning_rate": 5e-06,
"loss": 0.0102,
"num_tokens": 68385996.0,
"reward": 0.7210286855697632,
"reward_std": 0.07699891179800034,
"rewards/accuracy_reward": 0.6569010615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.78515625,
"rewards/mean_confidence_reward": 0.0,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.14778645833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4086.0,
"completions/mean_length": 1735.443359375,
"completions/mean_terminated_length": 1326.087890625,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.1152,
"grad_norm": 0.01978280581533909,
"learning_rate": 4.920634920634921e-06,
"loss": 0.0059,
"num_tokens": 71295525.0,
"reward": 0.7796224355697632,
"reward_std": 0.07605080306529999,
"rewards/accuracy_reward": 0.70703125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.8522135615348816,
"rewards/mean_confidence_reward": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.150390625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4095.0,
"completions/mean_length": 1611.516357421875,
"completions/mean_terminated_length": 1171.734130859375,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.1216,
"grad_norm": 0.02145254611968994,
"learning_rate": 4.841269841269842e-06,
"loss": 0.0045,
"num_tokens": 74020350.0,
"reward": 0.7809244990348816,
"reward_std": 0.06806011497974396,
"rewards/accuracy_reward": 0.712890625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.8489583134651184,
"rewards/mean_confidence_reward": 0.0,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.13802083333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4082.0,
"completions/mean_length": 1633.2982177734375,
"completions/mean_terminated_length": 1238.96826171875,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.128,
"grad_norm": 0.020902708172798157,
"learning_rate": 4.761904761904762e-06,
"loss": 0.0062,
"num_tokens": 76775704.0,
"reward": 0.7747396230697632,
"reward_std": 0.07498719543218613,
"rewards/accuracy_reward": 0.6875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.8619791865348816,
"rewards/mean_confidence_reward": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.10872395833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4005.0,
"completions/mean_length": 1514.591796875,
"completions/mean_terminated_length": 1199.6939697265625,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"epoch": 0.1344,
"grad_norm": 0.018049517646431923,
"learning_rate": 4.682539682539683e-06,
"loss": 0.0044,
"num_tokens": 79351357.0,
"reward": 0.8141276240348816,
"reward_std": 0.06124332174658775,
"rewards/accuracy_reward": 0.7369791865348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.8912760615348816,
"rewards/mean_confidence_reward": 0.0,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09895833333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4038.0,
"completions/mean_length": 1392.6966552734375,
"completions/mean_terminated_length": 1095.802001953125,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.1408,
"grad_norm": 0.01976490393280983,
"learning_rate": 4.603174603174604e-06,
"loss": 0.0073,
"num_tokens": 81734811.0,
"reward": 0.8177083730697632,
"reward_std": 0.06400330364704132,
"rewards/accuracy_reward": 0.734375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9010416865348816,
"rewards/mean_confidence_reward": 0.0,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.091796875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4019.0,
"completions/mean_length": 1432.442138671875,
"completions/mean_terminated_length": 1163.22216796875,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.1472,
"grad_norm": 0.01790717802941799,
"learning_rate": 4.523809523809524e-06,
"loss": 0.0055,
"num_tokens": 84190490.0,
"reward": 0.8294271230697632,
"reward_std": 0.05695289000868797,
"rewards/accuracy_reward": 0.751953125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9069010615348816,
"rewards/mean_confidence_reward": 0.0,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.060546875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4071.0,
"completions/mean_length": 1136.582763671875,
"completions/mean_terminated_length": 945.8510131835938,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1536,
"grad_norm": 0.019688351079821587,
"learning_rate": 4.444444444444444e-06,
"loss": 0.0073,
"num_tokens": 86182577.0,
"reward": 0.8580729365348816,
"reward_std": 0.060081034898757935,
"rewards/accuracy_reward": 0.7766926884651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.939453125,
"rewards/mean_confidence_reward": 0.0,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4088.0,
"completions/mean_length": 1181.623046875,
"completions/mean_terminated_length": 961.2080078125,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.16,
"grad_norm": 0.01960933580994606,
"learning_rate": 4.365079365079366e-06,
"loss": 0.0064,
"num_tokens": 88240142.0,
"reward": 0.8430989980697632,
"reward_std": 0.06078281253576279,
"rewards/accuracy_reward": 0.7565104365348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9296875,
"rewards/mean_confidence_reward": 0.0,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05924479166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4085.0,
"completions/mean_length": 1170.4818115234375,
"completions/mean_terminated_length": 986.2449951171875,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.1664,
"grad_norm": 0.019044535234570503,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.0052,
"num_tokens": 90288618.0,
"reward": 0.8356119990348816,
"reward_std": 0.06481396406888962,
"rewards/accuracy_reward": 0.73046875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9407551884651184,
"rewards/mean_confidence_reward": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4095.0,
"completions/mean_length": 848.7389526367188,
"completions/mean_terminated_length": 790.6369018554688,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1728,
"grad_norm": 0.02053948864340782,
"learning_rate": 4.206349206349207e-06,
"loss": 0.002,
"num_tokens": 91833401.0,
"reward": 0.8896484375,
"reward_std": 0.04453360289335251,
"rewards/accuracy_reward": 0.796875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.982421875,
"rewards/mean_confidence_reward": 0.0,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4080.0,
"completions/mean_length": 1033.705810546875,
"completions/mean_terminated_length": 909.2222290039062,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.1792,
"grad_norm": 0.01965363323688507,
"learning_rate": 4.126984126984127e-06,
"loss": 0.0043,
"num_tokens": 93668645.0,
"reward": 0.8759765625,
"reward_std": 0.06271578371524811,
"rewards/accuracy_reward": 0.791015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9609375,
"rewards/mean_confidence_reward": 0.0,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03190104166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4064.0,
"completions/mean_length": 944.6478271484375,
"completions/mean_terminated_length": 840.8035888671875,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.1856,
"grad_norm": 0.017758071422576904,
"learning_rate": 4.047619047619048e-06,
"loss": 0.0019,
"num_tokens": 95360064.0,
"reward": 0.8597005605697632,
"reward_std": 0.048148658126592636,
"rewards/accuracy_reward": 0.7513020634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9680989384651184,
"rewards/mean_confidence_reward": 0.0,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02799479166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4035.0,
"completions/mean_length": 963.6784057617188,
"completions/mean_terminated_length": 873.4641723632812,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.192,
"grad_norm": 0.020760418847203255,
"learning_rate": 3.968253968253968e-06,
"loss": 0.0067,
"num_tokens": 97084650.0,
"reward": 0.8678385615348816,
"reward_std": 0.05553457885980606,
"rewards/accuracy_reward": 0.763671875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9720051884651184,
"rewards/mean_confidence_reward": 0.0,
"step": 30
},
{
"epoch": 0.192,
"eval_completions/clipped_ratio": 0.021709735576923073,
"eval_completions/max_length": 4096.0,
"eval_completions/max_terminated_length": 3616.375,
"eval_completions/mean_length": 845.1283874511719,
"eval_completions/mean_terminated_length": 772.5844650268555,
"eval_completions/min_length": 154.25,
"eval_completions/min_terminated_length": 154.25,
"eval_loss": 0.0,
"eval_num_tokens": 97084650.0,
"eval_reward": 0.87646484375,
"eval_reward_std": 0.21883795596659184,
"eval_rewards/accuracy_reward": 0.7744140625,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.978515625,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 659.9975,
"eval_samples_per_second": 1.515,
"eval_steps_per_second": 0.012,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02083333333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4040.0,
"completions/mean_length": 878.7572021484375,
"completions/mean_terminated_length": 810.30517578125,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.1984,
"grad_norm": 0.020707620307803154,
"learning_rate": 3.88888888888889e-06,
"loss": 0.003,
"num_tokens": 98685229.0,
"reward": 0.8831380605697632,
"reward_std": 0.05947504937648773,
"rewards/accuracy_reward": 0.7884114384651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9778645634651184,
"rewards/mean_confidence_reward": 0.0,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4014.0,
"completions/mean_length": 817.9010620117188,
"completions/mean_terminated_length": 792.0892333984375,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.2048,
"grad_norm": 0.01970040611922741,
"learning_rate": 3.80952380952381e-06,
"loss": 0.0001,
"num_tokens": 100184125.0,
"reward": 0.8935546875,
"reward_std": 0.04857983812689781,
"rewards/accuracy_reward": 0.794921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4048.0,
"completions/mean_length": 802.2428588867188,
"completions/mean_terminated_length": 749.9609375,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.2112,
"grad_norm": 0.02303779311478138,
"learning_rate": 3.7301587301587305e-06,
"loss": 0.0039,
"num_tokens": 101661458.0,
"reward": 0.8557943105697632,
"reward_std": 0.06328541040420532,
"rewards/accuracy_reward": 0.7272135615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4063.0,
"completions/mean_length": 787.0625,
"completions/mean_terminated_length": 734.5396728515625,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.2176,
"grad_norm": 0.018765997141599655,
"learning_rate": 3.6507936507936507e-06,
"loss": 0.0032,
"num_tokens": 103115794.0,
"reward": 0.8919271230697632,
"reward_std": 0.043571051210165024,
"rewards/accuracy_reward": 0.7994791865348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4031.0,
"completions/mean_length": 764.9583740234375,
"completions/mean_terminated_length": 732.1078491210938,
"completions/min_length": 102.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.224,
"grad_norm": 0.02199125476181507,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.001,
"num_tokens": 104538930.0,
"reward": 0.8860677480697632,
"reward_std": 0.05475646257400513,
"rewards/accuracy_reward": 0.7819010615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00651041666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3941.0,
"completions/mean_length": 731.091796875,
"completions/mean_terminated_length": 709.0413208007812,
"completions/min_length": 132.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.2304,
"grad_norm": 0.019916215911507607,
"learning_rate": 3.492063492063492e-06,
"loss": 0.0015,
"num_tokens": 105911879.0,
"reward": 0.8876953125,
"reward_std": 0.04180898517370224,
"rewards/accuracy_reward": 0.7819010615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.0,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01106770833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3785.0,
"completions/mean_length": 782.7923583984375,
"completions/mean_terminated_length": 745.7122802734375,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.2368,
"grad_norm": 0.021914253011345863,
"learning_rate": 3.412698412698413e-06,
"loss": 0.0017,
"num_tokens": 107360480.0,
"reward": 0.88671875,
"reward_std": 0.053066499531269073,
"rewards/accuracy_reward": 0.7845051884651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9889323115348816,
"rewards/mean_confidence_reward": 0.0,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3992.0,
"completions/mean_length": 767.8822021484375,
"completions/mean_terminated_length": 741.676513671875,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.2432,
"grad_norm": 0.023255277425050735,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0007,
"num_tokens": 108788163.0,
"reward": 0.8880208730697632,
"reward_std": 0.05833045765757561,
"rewards/accuracy_reward": 0.7838541865348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01888020833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4027.0,
"completions/mean_length": 821.6953125,
"completions/mean_terminated_length": 758.6861572265625,
"completions/min_length": 148.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.2496,
"grad_norm": 0.02124691754579544,
"learning_rate": 3.2539682539682544e-06,
"loss": 0.0031,
"num_tokens": 110295215.0,
"reward": 0.876953125,
"reward_std": 0.056418370455503464,
"rewards/accuracy_reward": 0.7727864384651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9811198115348816,
"rewards/mean_confidence_reward": 0.0,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4080.0,
"completions/mean_length": 746.0670776367188,
"completions/mean_terminated_length": 713.0302734375,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.256,
"grad_norm": 0.021513385698199272,
"learning_rate": 3.1746031746031746e-06,
"loss": 0.0034,
"num_tokens": 111691110.0,
"reward": 0.8782552480697632,
"reward_std": 0.05017038434743881,
"rewards/accuracy_reward": 0.7662760615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01106770833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4080.0,
"completions/mean_length": 741.466796875,
"completions/mean_terminated_length": 703.9242553710938,
"completions/min_length": 135.0,
"completions/min_terminated_length": 135.0,
"epoch": 0.2624,
"grad_norm": 0.019013214856386185,
"learning_rate": 3.0952380952380957e-06,
"loss": 0.0013,
"num_tokens": 113077179.0,
"reward": 0.8899739980697632,
"reward_std": 0.038621604442596436,
"rewards/accuracy_reward": 0.791015625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9889323115348816,
"rewards/mean_confidence_reward": 0.0,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3970.0,
"completions/mean_length": 693.7057495117188,
"completions/mean_terminated_length": 660.152587890625,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.2688,
"grad_norm": 0.022837555035948753,
"learning_rate": 3.015873015873016e-06,
"loss": 0.0005,
"num_tokens": 114389551.0,
"reward": 0.8785807490348816,
"reward_std": 0.045825596898794174,
"rewards/accuracy_reward": 0.7669270634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3969.0,
"completions/mean_length": 813.830078125,
"completions/mean_terminated_length": 761.7321166992188,
"completions/min_length": 100.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.2752,
"grad_norm": 0.020699184387922287,
"learning_rate": 2.936507936507937e-06,
"loss": 0.0034,
"num_tokens": 115888346.0,
"reward": 0.8785807490348816,
"reward_std": 0.050285469740629196,
"rewards/accuracy_reward": 0.7727864384651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00846354166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3888.0,
"completions/mean_length": 724.4251708984375,
"completions/mean_terminated_length": 695.6461181640625,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.2816,
"grad_norm": 0.024175629019737244,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0019,
"num_tokens": 117249167.0,
"reward": 0.9088541865348816,
"reward_std": 0.054964929819107056,
"rewards/accuracy_reward": 0.826171875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9915364384651184,
"rewards/mean_confidence_reward": 0.0,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01106770833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4071.0,
"completions/mean_length": 746.92578125,
"completions/mean_terminated_length": 709.4443359375,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.288,
"grad_norm": 0.021540403366088867,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0011,
"num_tokens": 118642173.0,
"reward": 0.8756510615348816,
"reward_std": 0.05108930170536041,
"rewards/accuracy_reward": 0.7623698115348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9889323115348816,
"rewards/mean_confidence_reward": 0.0,
"step": 45
},
{
"epoch": 0.288,
"eval_completions/clipped_ratio": 0.005859375,
"eval_completions/max_length": 3775.5,
"eval_completions/max_terminated_length": 3543.375,
"eval_completions/mean_length": 740.3269271850586,
"eval_completions/mean_terminated_length": 720.5047912597656,
"eval_completions/min_length": 186.875,
"eval_completions/min_terminated_length": 186.875,
"eval_loss": 0.0,
"eval_num_tokens": 118642173.0,
"eval_reward": 0.88916015625,
"eval_reward_std": 0.19981716014444828,
"eval_rewards/accuracy_reward": 0.7841796875,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.994140625,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 603.9532,
"eval_samples_per_second": 1.656,
"eval_steps_per_second": 0.013,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4011.0,
"completions/mean_length": 716.748046875,
"completions/mean_terminated_length": 696.8310546875,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.2944,
"grad_norm": 0.022779908031225204,
"learning_rate": 2.6984126984126986e-06,
"loss": 0.0004,
"num_tokens": 119996194.0,
"reward": 0.9026693105697632,
"reward_std": 0.05144798010587692,
"rewards/accuracy_reward": 0.8111979365348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.994140625,
"rewards/mean_confidence_reward": 0.0,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00716145833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4062.0,
"completions/mean_length": 770.3229370117188,
"completions/mean_terminated_length": 746.3344116210938,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.3008,
"grad_norm": 0.01919720135629177,
"learning_rate": 2.6190476190476192e-06,
"loss": 0.001,
"num_tokens": 121428754.0,
"reward": 0.8958333730697632,
"reward_std": 0.04597648233175278,
"rewards/accuracy_reward": 0.798828125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9928385615348816,
"rewards/mean_confidence_reward": 0.0,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01822916666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4030.0,
"completions/mean_length": 835.46875,
"completions/mean_terminated_length": 774.9283447265625,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.3072,
"grad_norm": 0.02324065938591957,
"learning_rate": 2.53968253968254e-06,
"loss": 0.0024,
"num_tokens": 122966546.0,
"reward": 0.857421875,
"reward_std": 0.062466781586408615,
"rewards/accuracy_reward": 0.7330729365348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9817708134651184,
"rewards/mean_confidence_reward": 0.0,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01627604166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4094.0,
"completions/mean_length": 792.6647338867188,
"completions/mean_terminated_length": 738.0099487304688,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.3136,
"grad_norm": 0.022559164091944695,
"learning_rate": 2.4603174603174605e-06,
"loss": 0.0016,
"num_tokens": 124435511.0,
"reward": 0.890625,
"reward_std": 0.05695483088493347,
"rewards/accuracy_reward": 0.7975260615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9837239384651184,
"rewards/mean_confidence_reward": 0.0,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3965.0,
"completions/mean_length": 756.9381713867188,
"completions/mean_terminated_length": 717.3445434570312,
"completions/min_length": 113.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.32,
"grad_norm": 0.023683471605181694,
"learning_rate": 2.380952380952381e-06,
"loss": 0.0034,
"num_tokens": 125844928.0,
"reward": 0.8766276240348816,
"reward_std": 0.0529298409819603,
"rewards/accuracy_reward": 0.7649739384651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00911458333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3821.0,
"completions/mean_length": 673.533203125,
"completions/mean_terminated_length": 642.0518798828125,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.3264,
"grad_norm": 0.0213849525898695,
"learning_rate": 2.301587301587302e-06,
"loss": 0.002,
"num_tokens": 127124315.0,
"reward": 0.8948568105697632,
"reward_std": 0.04318075627088547,
"rewards/accuracy_reward": 0.798828125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9908854365348816,
"rewards/mean_confidence_reward": 0.0,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4096.0,
"completions/mean_length": 713.7291870117188,
"completions/mean_terminated_length": 693.7943725585938,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.3328,
"grad_norm": 0.02206575684249401,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0026,
"num_tokens": 128472651.0,
"reward": 0.8961588740348816,
"reward_std": 0.0437723845243454,
"rewards/accuracy_reward": 0.7981770634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.994140625,
"rewards/mean_confidence_reward": 0.0,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00455729166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3920.0,
"completions/mean_length": 780.7955932617188,
"completions/mean_terminated_length": 765.6180419921875,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.3392,
"grad_norm": 0.017915785312652588,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.0014,
"num_tokens": 129916081.0,
"reward": 0.9124349355697632,
"reward_std": 0.04158523678779602,
"rewards/accuracy_reward": 0.8294270634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9954426884651184,
"rewards/mean_confidence_reward": 0.0,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00651041666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3977.0,
"completions/mean_length": 673.7780151367188,
"completions/mean_terminated_length": 651.3519287109375,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.3456,
"grad_norm": 0.020742084830999374,
"learning_rate": 2.0634920634920634e-06,
"loss": 0.0008,
"num_tokens": 131197388.0,
"reward": 0.8958333730697632,
"reward_std": 0.04164648801088333,
"rewards/accuracy_reward": 0.7981770634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.0,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4034.0,
"completions/mean_length": 732.0514526367188,
"completions/mean_terminated_length": 705.5636596679688,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.352,
"grad_norm": 0.019956564530730247,
"learning_rate": 1.984126984126984e-06,
"loss": 0.001,
"num_tokens": 132571659.0,
"reward": 0.8857421875,
"reward_std": 0.045707326382398605,
"rewards/accuracy_reward": 0.779296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9921875,
"rewards/mean_confidence_reward": 0.0,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00520833333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4006.0,
"completions/mean_length": 767.5592651367188,
"completions/mean_terminated_length": 750.1328735351562,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.3584,
"grad_norm": 0.01656208373606205,
"learning_rate": 1.904761904761905e-06,
"loss": 0.0026,
"num_tokens": 134000862.0,
"reward": 0.9020182490348816,
"reward_std": 0.038941457867622375,
"rewards/accuracy_reward": 0.8092448115348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9947916865348816,
"rewards/mean_confidence_reward": 0.0,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0026041666666666297,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3850.0,
"completions/mean_length": 661.19921875,
"completions/mean_terminated_length": 652.2310791015625,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"epoch": 0.3648,
"grad_norm": 0.018361348658800125,
"learning_rate": 1.8253968253968254e-06,
"loss": 0.0015,
"num_tokens": 135264776.0,
"reward": 0.9013671875,
"reward_std": 0.03878443315625191,
"rewards/accuracy_reward": 0.8053385615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9973958134651184,
"rewards/mean_confidence_reward": 0.0,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01302083333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4088.0,
"completions/mean_length": 766.0013427734375,
"completions/mean_terminated_length": 722.0699462890625,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.3712,
"grad_norm": 0.020110923796892166,
"learning_rate": 1.746031746031746e-06,
"loss": 0.0031,
"num_tokens": 136692626.0,
"reward": 0.9104818105697632,
"reward_std": 0.05508416146039963,
"rewards/accuracy_reward": 0.833984375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9869791865348816,
"rewards/mean_confidence_reward": 0.0,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3904.0,
"completions/mean_length": 759.0625,
"completions/mean_terminated_length": 739.3948974609375,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.3776,
"grad_norm": 0.02654006890952587,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0013,
"num_tokens": 138104586.0,
"reward": 0.8876953125,
"reward_std": 0.044509004801511765,
"rewards/accuracy_reward": 0.78125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.994140625,
"rewards/mean_confidence_reward": 0.0,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00911458333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3975.0,
"completions/mean_length": 840.5924682617188,
"completions/mean_terminated_length": 810.6478271484375,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.384,
"grad_norm": 0.021109433844685555,
"learning_rate": 1.5873015873015873e-06,
"loss": 0.0025,
"num_tokens": 139644632.0,
"reward": 0.8779296875,
"reward_std": 0.05428009480237961,
"rewards/accuracy_reward": 0.7649739384651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9908854365348816,
"rewards/mean_confidence_reward": 0.0,
"step": 60
},
{
"epoch": 0.384,
"eval_completions/clipped_ratio": 0.005859375,
"eval_completions/max_length": 3920.5,
"eval_completions/max_terminated_length": 3464.5,
"eval_completions/mean_length": 786.6544494628906,
"eval_completions/mean_terminated_length": 767.0381546020508,
"eval_completions/min_length": 187.375,
"eval_completions/min_terminated_length": 187.375,
"eval_loss": 0.0,
"eval_num_tokens": 139644632.0,
"eval_reward": 0.884765625,
"eval_reward_std": 0.2027157824486494,
"eval_rewards/accuracy_reward": 0.775390625,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.994140625,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 622.9608,
"eval_samples_per_second": 1.605,
"eval_steps_per_second": 0.013,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01236979166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4059.0,
"completions/mean_length": 900.9095458984375,
"completions/mean_terminated_length": 860.891845703125,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.3904,
"grad_norm": 0.019216015934944153,
"learning_rate": 1.507936507936508e-06,
"loss": 0.0014,
"num_tokens": 141286341.0,
"reward": 0.861328125,
"reward_std": 0.05327238887548447,
"rewards/accuracy_reward": 0.7350260615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9876301884651184,
"rewards/mean_confidence_reward": 0.0,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.017578125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4066.0,
"completions/mean_length": 839.0540771484375,
"completions/mean_terminated_length": 780.7786865234375,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.3968,
"grad_norm": 0.021654291078448296,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.0025,
"num_tokens": 142824816.0,
"reward": 0.8942057490348816,
"reward_std": 0.06182871386408806,
"rewards/accuracy_reward": 0.8059895634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.982421875,
"rewards/mean_confidence_reward": 0.0,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009765625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4030.0,
"completions/mean_length": 764.9212646484375,
"completions/mean_terminated_length": 732.0703735351562,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"epoch": 0.4032,
"grad_norm": 0.018562061712145805,
"learning_rate": 1.3492063492063493e-06,
"loss": 0.0021,
"num_tokens": 144247383.0,
"reward": 0.8984375,
"reward_std": 0.041560571640729904,
"rewards/accuracy_reward": 0.806640625,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.990234375,
"rewards/mean_confidence_reward": 0.0,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00651041666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4011.0,
"completions/mean_length": 841.3509521484375,
"completions/mean_terminated_length": 820.02294921875,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.4096,
"grad_norm": 0.018766365945339203,
"learning_rate": 1.26984126984127e-06,
"loss": 0.0012,
"num_tokens": 145786162.0,
"reward": 0.9117838740348816,
"reward_std": 0.035551950335502625,
"rewards/accuracy_reward": 0.830078125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.0,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3912.0,
"completions/mean_length": 796.5963745117188,
"completions/mean_terminated_length": 744.224853515625,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.416,
"grad_norm": 0.021488916128873825,
"learning_rate": 1.1904761904761906e-06,
"loss": 0.0036,
"num_tokens": 147262862.0,
"reward": 0.8873698115348816,
"reward_std": 0.04491402953863144,
"rewards/accuracy_reward": 0.7903645634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.984375,
"rewards/mean_confidence_reward": 0.0,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0032552083333333703,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3934.0,
"completions/mean_length": 757.4036865234375,
"completions/mean_terminated_length": 746.5003051757812,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.4224,
"grad_norm": 0.014640610665082932,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0013,
"num_tokens": 148673874.0,
"reward": 0.9049479365348816,
"reward_std": 0.029130559414625168,
"rewards/accuracy_reward": 0.8131510615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9967448115348816,
"rewards/mean_confidence_reward": 0.0,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001953125,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3851.0,
"completions/mean_length": 728.4642333984375,
"completions/mean_terminated_length": 721.8741455078125,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.4288,
"grad_norm": 0.017500903457403183,
"learning_rate": 1.0317460317460317e-06,
"loss": 0.0009,
"num_tokens": 150038443.0,
"reward": 0.9033203125,
"reward_std": 0.03719018027186394,
"rewards/accuracy_reward": 0.80859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.998046875,
"rewards/mean_confidence_reward": 0.0,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01497395833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4089.0,
"completions/mean_length": 797.9720458984375,
"completions/mean_terminated_length": 747.8367919921875,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.4352,
"grad_norm": 0.018346579745411873,
"learning_rate": 9.523809523809525e-07,
"loss": 0.0037,
"num_tokens": 151514616.0,
"reward": 0.880859375,
"reward_std": 0.048622481524944305,
"rewards/accuracy_reward": 0.7766926884651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9850260615348816,
"rewards/mean_confidence_reward": 0.0,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01627604166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3999.0,
"completions/mean_length": 867.1784057617188,
"completions/mean_terminated_length": 813.7564697265625,
"completions/min_length": 141.0,
"completions/min_terminated_length": 141.0,
"epoch": 0.4416,
"grad_norm": 0.019951898604631424,
"learning_rate": 8.73015873015873e-07,
"loss": -0.0004,
"num_tokens": 153096354.0,
"reward": 0.8893229365348816,
"reward_std": 0.053706176578998566,
"rewards/accuracy_reward": 0.794921875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9837239384651184,
"rewards/mean_confidence_reward": 0.0,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3932.0,
"completions/mean_length": 775.0885620117188,
"completions/mean_terminated_length": 755.515380859375,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.448,
"grad_norm": 0.019857991486787796,
"learning_rate": 7.936507936507937e-07,
"loss": 0.0015,
"num_tokens": 154536874.0,
"reward": 0.9117838740348816,
"reward_std": 0.050319306552410126,
"rewards/accuracy_reward": 0.8294270634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.994140625,
"rewards/mean_confidence_reward": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00716145833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3974.0,
"completions/mean_length": 759.080078125,
"completions/mean_terminated_length": 735.010498046875,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.4544,
"grad_norm": 0.017306774854660034,
"learning_rate": 7.142857142857143e-07,
"loss": 0.0006,
"num_tokens": 155947013.0,
"reward": 0.9140625,
"reward_std": 0.03777480125427246,
"rewards/accuracy_reward": 0.8352864384651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9928385615348816,
"rewards/mean_confidence_reward": 0.0,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00911458333333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3886.0,
"completions/mean_length": 777.9954833984375,
"completions/mean_terminated_length": 747.4750366210938,
"completions/min_length": 121.0,
"completions/min_terminated_length": 121.0,
"epoch": 0.4608,
"grad_norm": 0.01818043924868107,
"learning_rate": 6.34920634920635e-07,
"loss": 0.0022,
"num_tokens": 157392574.0,
"reward": 0.89453125,
"reward_std": 0.03996651619672775,
"rewards/accuracy_reward": 0.7981770634651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9908854365348816,
"rewards/mean_confidence_reward": 0.0,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01627604166666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4086.0,
"completions/mean_length": 819.3580932617188,
"completions/mean_terminated_length": 765.1449584960938,
"completions/min_length": 120.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.4672,
"grad_norm": 0.018845003098249435,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0013,
"num_tokens": 158892196.0,
"reward": 0.8815104365348816,
"reward_std": 0.04560422524809837,
"rewards/accuracy_reward": 0.779296875,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9837239384651184,
"rewards/mean_confidence_reward": 0.0,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4093.0,
"completions/mean_length": 759.9166870117188,
"completions/mean_terminated_length": 746.833984375,
"completions/min_length": 122.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.4736,
"grad_norm": 0.017951594665646553,
"learning_rate": 4.7619047619047623e-07,
"loss": 0.0021,
"num_tokens": 160304956.0,
"reward": 0.9007161855697632,
"reward_std": 0.03838188201189041,
"rewards/accuracy_reward": 0.8053385615348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.99609375,
"rewards/mean_confidence_reward": 0.0,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00651041666666663,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3974.0,
"completions/mean_length": 832.7454833984375,
"completions/mean_terminated_length": 811.361083984375,
"completions/min_length": 203.0,
"completions/min_terminated_length": 203.0,
"epoch": 0.48,
"grad_norm": 0.01818317361176014,
"learning_rate": 3.9682539682539683e-07,
"loss": 0.001,
"num_tokens": 161828517.0,
"reward": 0.8746744990348816,
"reward_std": 0.04868558794260025,
"rewards/accuracy_reward": 0.755859375,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9934895634651184,
"rewards/mean_confidence_reward": 0.0,
"step": 75
},
{
"epoch": 0.48,
"eval_completions/clipped_ratio": 0.0068359375,
"eval_completions/max_length": 3771.875,
"eval_completions/max_terminated_length": 3558.375,
"eval_completions/mean_length": 793.0995407104492,
"eval_completions/mean_terminated_length": 770.1827087402344,
"eval_completions/min_length": 188.125,
"eval_completions/min_terminated_length": 188.125,
"eval_loss": 0.0,
"eval_num_tokens": 161828517.0,
"eval_reward": 0.880859375,
"eval_reward_std": 0.20689411088824272,
"eval_rewards/accuracy_reward": 0.76953125,
"eval_rewards/brier_reward": 0.0,
"eval_rewards/confidence_one_or_zero": 0.0,
"eval_rewards/format_reward": 0.9921875,
"eval_rewards/mean_confidence_reward": 0.0,
"eval_runtime": 625.2922,
"eval_samples_per_second": 1.599,
"eval_steps_per_second": 0.013,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00716145833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4003.0,
"completions/mean_length": 772.556640625,
"completions/mean_terminated_length": 748.5842895507812,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.4864,
"grad_norm": 0.015109594911336899,
"learning_rate": 3.174603174603175e-07,
"loss": 0.0019,
"num_tokens": 163259364.0,
"reward": 0.9322916865348816,
"reward_std": 0.034481633454561234,
"rewards/accuracy_reward": 0.8717448115348816,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9928385615348816,
"rewards/mean_confidence_reward": 0.0,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00716145833333337,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 3903.0,
"completions/mean_length": 738.8314208984375,
"completions/mean_terminated_length": 714.61572265625,
"completions/min_length": 138.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.4928,
"grad_norm": 0.01741495169699192,
"learning_rate": 2.3809523809523811e-07,
"loss": 0.0019,
"num_tokens": 164639529.0,
"reward": 0.9130859375,
"reward_std": 0.040897756814956665,
"rewards/accuracy_reward": 0.8333333134651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.9928385615348816,
"rewards/mean_confidence_reward": 0.0,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005859375,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4073.0,
"completions/mean_length": 806.326171875,
"completions/mean_terminated_length": 786.9371337890625,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.4992,
"grad_norm": 0.018980255350470543,
"learning_rate": 1.5873015873015874e-07,
"loss": 0.0009,
"num_tokens": 166128470.0,
"reward": 0.8951823115348816,
"reward_std": 0.04883330315351486,
"rewards/accuracy_reward": 0.7962239384651184,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.994140625,
"rewards/mean_confidence_reward": 0.0,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 4096.0,
"completions/max_terminated_length": 4062.0,
"completions/mean_length": 797.5814208984375,
"completions/mean_terminated_length": 758.4697265625,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.5056,
"grad_norm": 0.018569298088550568,
"learning_rate": 7.936507936507937e-08,
"loss": 0.0027,
"num_tokens": 167603235.0,
"reward": 0.900390625,
"reward_std": 0.04614834114909172,
"rewards/accuracy_reward": 0.8125,
"rewards/brier_reward": 0.0,
"rewards/confidence_one_or_zero": 0.0,
"rewards/format_reward": 0.98828125,
"rewards/mean_confidence_reward": 0.0,
"step": 79
},
{
"epoch": 0.5056,
"step": 79,
"total_flos": 0.0,
"train_loss": 0.003337171362695296,
"train_runtime": 71057.8488,
"train_samples_per_second": 0.211,
"train_steps_per_second": 0.001
}
],
"logging_steps": 1,
"max_steps": 79,
"num_input_tokens_seen": 167603235,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}