{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5008, "eval_steps": 15, "global_step": 313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 602.2611083984375, "completions/mean_terminated_length": 511.2418518066406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0016, "grad_norm": 0.003618580522015691, "learning_rate": 7.936507936507937e-08, "loss": 0.032, "num_tokens": 1338513.0, "reward": 0.6121081709861755, "reward_std": 0.5448371767997742, "rewards/accuracy_reward": 0.2734375, "rewards/brier_reward": 0.3270573616027832, "rewards/confidence_one_or_zero": 0.3430989682674408, "rewards/format_reward": 0.6236979365348816, "rewards/mean_confidence_reward": 0.8331964612007141, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01627604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 576.8424682617188, "completions/mean_terminated_length": 518.6168212890625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.0032, "grad_norm": 0.0025654127821326256, "learning_rate": 1.5873015873015874e-07, "loss": 0.0101, "num_tokens": 2633247.0, "reward": 0.5621463060379028, "reward_std": 0.5485203266143799, "rewards/accuracy_reward": 0.2447916716337204, "rewards/brier_reward": 0.2967948615550995, "rewards/confidence_one_or_zero": 0.35546875, "rewards/format_reward": 0.5826823115348816, "rewards/mean_confidence_reward": 0.8323763012886047, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02473958333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3974.0, "completions/mean_length": 603.646484375, "completions/mean_terminated_length": 515.055419921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0048, "grad_norm": 0.0029752945993095636, "learning_rate": 2.3809523809523811e-07, "loss": 0.027, "num_tokens": 3978944.0, "reward": 0.5746791362762451, "reward_std": 0.5263984203338623, "rewards/accuracy_reward": 0.2513020932674408, "rewards/brier_reward": 0.30167821049690247, "rewards/confidence_one_or_zero": 0.3541666567325592, "rewards/format_reward": 0.5963541865348816, "rewards/mean_confidence_reward": 0.8375129699707031, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 636.4596557617188, "completions/mean_terminated_length": 524.861572265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0064, "grad_norm": 0.0022255138028413057, "learning_rate": 3.174603174603175e-07, "loss": 0.0183, "num_tokens": 5370914.0, "reward": 0.5558524131774902, "reward_std": 0.5339803099632263, "rewards/accuracy_reward": 0.2447916716337204, "rewards/brier_reward": 0.2972289025783539, "rewards/confidence_one_or_zero": 0.3255208432674408, "rewards/format_reward": 0.5696614384651184, "rewards/mean_confidence_reward": 0.81494140625, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02604166666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 637.2916870117188, "completions/mean_terminated_length": 544.8128662109375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.008, "grad_norm": 0.0026658286806195974, "learning_rate": 3.9682539682539683e-07, "loss": 0.0289, "num_tokens": 6767682.0, "reward": 0.59138023853302, "reward_std": 0.5628894567489624, "rewards/accuracy_reward": 0.2630208432674408, "rewards/brier_reward": 0.3175024092197418, "rewards/confidence_one_or_zero": 0.3444010317325592, "rewards/format_reward": 0.6022135615348816, "rewards/mean_confidence_reward": 0.837052047252655, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3958.0, "completions/mean_length": 584.0338745117188, "completions/mean_terminated_length": 518.824951171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0096, "grad_norm": 0.0032022378873080015, "learning_rate": 4.7619047619047623e-07, "loss": 0.0105, "num_tokens": 8090390.0, "reward": 0.5799046754837036, "reward_std": 0.5018365383148193, "rewards/accuracy_reward": 0.2389322966337204, "rewards/brier_reward": 0.2965053617954254, "rewards/confidence_one_or_zero": 0.328125, "rewards/format_reward": 0.6243489384651184, "rewards/mean_confidence_reward": 0.8104459643363953, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3968.0, "completions/mean_length": 584.28515625, "completions/mean_terminated_length": 514.3306884765625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.0112, "grad_norm": 0.0026558355893939734, "learning_rate": 5.555555555555555e-07, "loss": 0.015, "num_tokens": 9398220.0, "reward": 0.5896402597427368, "reward_std": 0.5508158802986145, "rewards/accuracy_reward": 0.2682291567325592, "rewards/brier_reward": 0.31792858242988586, "rewards/confidence_one_or_zero": 0.3541666567325592, "rewards/format_reward": 0.5930989384651184, "rewards/mean_confidence_reward": 0.8344095349311829, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 579.3873901367188, "completions/mean_terminated_length": 502.1763000488281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0128, "grad_norm": 0.0031884699128568172, "learning_rate": 6.34920634920635e-07, "loss": 0.0251, "num_tokens": 10697247.0, "reward": 0.6372284293174744, "reward_std": 0.5331804156303406, "rewards/accuracy_reward": 0.2890625, "rewards/brier_reward": 0.3453974723815918, "rewards/confidence_one_or_zero": 0.3053385317325592, "rewards/format_reward": 0.6399739384651184, "rewards/mean_confidence_reward": 0.8413346409797668, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02018229166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4058.0, "completions/mean_length": 585.6875, "completions/mean_terminated_length": 513.3820190429688, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.0144, "grad_norm": 0.003014641348272562, "learning_rate": 7.142857142857143e-07, "loss": 0.0233, "num_tokens": 12004127.0, "reward": 0.6344260573387146, "reward_std": 0.5287589430809021, "rewards/accuracy_reward": 0.2734375, "rewards/brier_reward": 0.33132824301719666, "rewards/confidence_one_or_zero": 0.3483072817325592, "rewards/format_reward": 0.6640625, "rewards/mean_confidence_reward": 0.8443684577941895, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01497395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3951.0, "completions/mean_length": 562.2421875, "completions/mean_terminated_length": 508.52349853515625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.016, "grad_norm": 0.01983969658613205, "learning_rate": 7.936507936507937e-07, "loss": 0.026, "num_tokens": 13289587.0, "reward": 0.7001076936721802, "reward_std": 0.5193542242050171, "rewards/accuracy_reward": 0.3001302182674408, "rewards/brier_reward": 0.36959290504455566, "rewards/confidence_one_or_zero": 0.314453125, "rewards/format_reward": 0.73046875, "rewards/mean_confidence_reward": 0.8572850823402405, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02669270833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4008.0, "completions/mean_length": 687.537109375, "completions/mean_terminated_length": 594.0608520507812, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0176, "grad_norm": 0.0028164065442979336, "learning_rate": 8.73015873015873e-07, "loss": 0.0337, "num_tokens": 14763916.0, "reward": 0.6904506683349609, "reward_std": 0.5314526557922363, "rewards/accuracy_reward": 0.3131510317325592, "rewards/brier_reward": 0.3685077726840973, "rewards/confidence_one_or_zero": 0.3600260317325592, "rewards/format_reward": 0.69921875, "rewards/mean_confidence_reward": 0.8279464244842529, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02213541666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 548.6419677734375, "completions/mean_terminated_length": 468.3421936035156, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.0192, "grad_norm": 0.0030605143401771784, "learning_rate": 9.523809523809525e-07, "loss": 0.0137, "num_tokens": 16038374.0, "reward": 0.6675162315368652, "reward_std": 0.46157020330429077, "rewards/accuracy_reward": 0.2513020932674408, "rewards/brier_reward": 0.32654550671577454, "rewards/confidence_one_or_zero": 0.3255208432674408, "rewards/format_reward": 0.7571614384651184, "rewards/mean_confidence_reward": 0.8442642688751221, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3260.0, "completions/mean_length": 459.0091247558594, "completions/mean_terminated_length": 425.5545349121094, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.0208, "grad_norm": 0.0016741788713261485, "learning_rate": 1.0317460317460317e-06, "loss": 0.0117, "num_tokens": 17155732.0, "reward": 0.7468482851982117, "reward_std": 0.47192758321762085, "rewards/accuracy_reward": 0.2825520932674408, "rewards/brier_reward": 0.37127718329429626, "rewards/confidence_one_or_zero": 0.3020833432674408, "rewards/format_reward": 0.83984375, "rewards/mean_confidence_reward": 0.8771354556083679, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 451.81512451171875, "completions/mean_terminated_length": 386.61102294921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0224, "grad_norm": 0.003300126176327467, "learning_rate": 1.111111111111111e-06, "loss": 0.0086, "num_tokens": 18245400.0, "reward": 0.8479976654052734, "reward_std": 0.4430612027645111, "rewards/accuracy_reward": 0.3678385317325592, "rewards/brier_reward": 0.4453202188014984, "rewards/confidence_one_or_zero": 0.318359375, "rewards/format_reward": 0.8828125, "rewards/mean_confidence_reward": 0.8865415453910828, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3486.0, "completions/mean_length": 415.33270263671875, "completions/mean_terminated_length": 386.3510437011719, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.024, "grad_norm": 0.0029791626147925854, "learning_rate": 1.1904761904761906e-06, "loss": 0.0027, "num_tokens": 19275159.0, "reward": 0.8517564535140991, "reward_std": 0.4424428343772888, "rewards/accuracy_reward": 0.3515625, "rewards/brier_reward": 0.4391661584377289, "rewards/confidence_one_or_zero": 0.29296875, "rewards/format_reward": 0.9127604365348816, "rewards/mean_confidence_reward": 0.8963094353675842, "step": 15 }, { "epoch": 0.024, "eval_completions/clipped_ratio": 0.0126953125, "eval_completions/max_length": 3583.5, "eval_completions/max_terminated_length": 2008.875, "eval_completions/mean_length": 481.3715476989746, "eval_completions/mean_terminated_length": 434.84699630737305, "eval_completions/min_length": 84.5, "eval_completions/min_terminated_length": 84.5, "eval_loss": 0.0, "eval_num_tokens": 19275159.0, "eval_reward": 0.8100704252719879, "eval_reward_std": 0.495119359344244, "eval_rewards/accuracy_reward": 0.3134765625, "eval_rewards/brier_reward": 0.4052739255130291, "eval_rewards/confidence_one_or_zero": 0.28125, "eval_rewards/format_reward": 0.9013671875, "eval_rewards/mean_confidence_reward": 0.8801269456744194, "eval_runtime": 230.1383, "eval_samples_per_second": 4.345, "eval_steps_per_second": 0.035, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 497.6966247558594, "completions/mean_terminated_length": 445.4095153808594, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0256, "grad_norm": 0.0026846034452319145, "learning_rate": 1.26984126984127e-06, "loss": 0.0236, "num_tokens": 20446501.0, "reward": 0.80788654088974, "reward_std": 0.44214963912963867, "rewards/accuracy_reward": 0.3151041567325592, "rewards/brier_reward": 0.4087187945842743, "rewards/confidence_one_or_zero": 0.2819010317325592, "rewards/format_reward": 0.8919270634651184, "rewards/mean_confidence_reward": 0.8700169920921326, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3925.0, "completions/mean_length": 420.4231872558594, "completions/mean_terminated_length": 391.48162841796875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.0272, "grad_norm": 0.0046990360133349895, "learning_rate": 1.3492063492063493e-06, "loss": 0.0054, "num_tokens": 21491567.0, "reward": 0.9137815237045288, "reward_std": 0.4410718083381653, "rewards/accuracy_reward": 0.3912760317325592, "rewards/brier_reward": 0.4883469343185425, "rewards/confidence_one_or_zero": 0.265625, "rewards/format_reward": 0.9479166865348816, "rewards/mean_confidence_reward": 0.8984166979789734, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 447.48828125, "completions/mean_terminated_length": 406.6556701660156, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0288, "grad_norm": 0.0028384097386151552, "learning_rate": 1.4285714285714286e-06, "loss": 0.0081, "num_tokens": 22591965.0, "reward": 0.8286608457565308, "reward_std": 0.375907838344574, "rewards/accuracy_reward": 0.29296875, "rewards/brier_reward": 0.40664854645729065, "rewards/confidence_one_or_zero": 0.2135416716337204, "rewards/format_reward": 0.9576823115348816, "rewards/mean_confidence_reward": 0.8898482322692871, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 501.3548278808594, "completions/mean_terminated_length": 437.0371398925781, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.0304, "grad_norm": 0.0012362151173874736, "learning_rate": 1.507936507936508e-06, "loss": 0.0137, "num_tokens": 23777182.0, "reward": 0.9273778796195984, "reward_std": 0.4081469476222992, "rewards/accuracy_reward": 0.3951822817325592, "rewards/brier_reward": 0.4999166429042816, "rewards/confidence_one_or_zero": 0.1998697966337204, "rewards/format_reward": 0.9596354365348816, "rewards/mean_confidence_reward": 0.8776745796203613, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01692708333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 559.0306396484375, "completions/mean_terminated_length": 498.129150390625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.032, "grad_norm": 0.00566128920763731, "learning_rate": 1.5873015873015873e-06, "loss": 0.0079, "num_tokens": 25062573.0, "reward": 0.8668637275695801, "reward_std": 0.387631356716156, "rewards/accuracy_reward": 0.3326822817325592, "rewards/brier_reward": 0.44594645500183105, "rewards/confidence_one_or_zero": 0.162109375, "rewards/format_reward": 0.955078125, "rewards/mean_confidence_reward": 0.8737239837646484, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 461.44012451171875, "completions/mean_terminated_length": 428.00787353515625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.0336, "grad_norm": 0.0017165376339107752, "learning_rate": 1.6666666666666667e-06, "loss": 0.0027, "num_tokens": 26183761.0, "reward": 0.873135507106781, "reward_std": 0.33156347274780273, "rewards/accuracy_reward": 0.3118489682674408, "rewards/brier_reward": 0.44937559962272644, "rewards/confidence_one_or_zero": 0.1432291716337204, "rewards/format_reward": 0.9850260615348816, "rewards/mean_confidence_reward": 0.8790267109870911, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3615.0, "completions/mean_length": 502.67645263671875, "completions/mean_terminated_length": 467.2393493652344, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0352, "grad_norm": 0.0008944218861870468, "learning_rate": 1.746031746031746e-06, "loss": 0.0045, "num_tokens": 27379040.0, "reward": 0.9035542011260986, "reward_std": 0.35924404859542847, "rewards/accuracy_reward": 0.3522135317325592, "rewards/brier_reward": 0.47570785880088806, "rewards/confidence_one_or_zero": 0.1451822966337204, "rewards/format_reward": 0.9791666865348816, "rewards/mean_confidence_reward": 0.8729746341705322, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 504.37567138671875, "completions/mean_terminated_length": 471.3383483886719, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.0368, "grad_norm": 0.0009152626735158265, "learning_rate": 1.8253968253968254e-06, "loss": 0.0056, "num_tokens": 28568929.0, "reward": 0.9768198728561401, "reward_std": 0.3738541305065155, "rewards/accuracy_reward": 0.42578125, "rewards/brier_reward": 0.5434637069702148, "rewards/confidence_one_or_zero": 0.1100260391831398, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.8694397807121277, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3831.0, "completions/mean_length": 478.71875, "completions/mean_terminated_length": 435.8260803222656, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.0384, "grad_norm": 0.0013099253410473466, "learning_rate": 1.904761904761905e-06, "loss": 0.009, "num_tokens": 29715025.0, "reward": 1.0156792402267456, "reward_std": 0.3414575755596161, "rewards/accuracy_reward": 0.4635416567325592, "rewards/brier_reward": 0.5840739011764526, "rewards/confidence_one_or_zero": 0.0826822891831398, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.8625586032867432, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01888020833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 589.2916870117188, "completions/mean_terminated_length": 521.8102416992188, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.04, "grad_norm": 0.0009056107955984771, "learning_rate": 1.984126984126984e-06, "loss": 0.0083, "num_tokens": 31039473.0, "reward": 0.9684207439422607, "reward_std": 0.33645710349082947, "rewards/accuracy_reward": 0.4127604067325592, "rewards/brier_reward": 0.5494537949562073, "rewards/confidence_one_or_zero": 0.0598958320915699, "rewards/format_reward": 0.974609375, "rewards/mean_confidence_reward": 0.841796875, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3911.0, "completions/mean_length": 556.99609375, "completions/mean_terminated_length": 493.6739807128906, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.0416, "grad_norm": 0.006632152479141951, "learning_rate": 2.0634920634920634e-06, "loss": 0.0217, "num_tokens": 32313643.0, "reward": 0.970467746257782, "reward_std": 0.3074415326118469, "rewards/accuracy_reward": 0.404296875, "rewards/brier_reward": 0.5574541091918945, "rewards/confidence_one_or_zero": 0.0611979179084301, "rewards/format_reward": 0.9791666865348816, "rewards/mean_confidence_reward": 0.8343679308891296, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3996.0, "completions/mean_length": 552.4759521484375, "completions/mean_terminated_length": 524.5741577148438, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.0432, "grad_norm": 0.000750439940020442, "learning_rate": 2.1428571428571427e-06, "loss": 0.0101, "num_tokens": 33580518.0, "reward": 1.0152522325515747, "reward_std": 0.33530712127685547, "rewards/accuracy_reward": 0.4479166567325592, "rewards/brier_reward": 0.5916847586631775, "rewards/confidence_one_or_zero": 0.0377604179084301, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.8405468463897705, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 449.37762451171875, "completions/mean_terminated_length": 425.48101806640625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.0448, "grad_norm": 0.0014009646838530898, "learning_rate": 2.222222222222222e-06, "loss": 0.0127, "num_tokens": 34683626.0, "reward": 1.1046332120895386, "reward_std": 0.31683462858200073, "rewards/accuracy_reward": 0.552734375, "rewards/brier_reward": 0.6649779081344604, "rewards/confidence_one_or_zero": 0.033203125, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.8467767238616943, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3970.0, "completions/mean_length": 561.5579833984375, "completions/mean_terminated_length": 543.0530395507812, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0464, "grad_norm": 0.0013355121482163668, "learning_rate": 2.301587301587302e-06, "loss": -0.0029, "num_tokens": 35959523.0, "reward": 1.0695838928222656, "reward_std": 0.3413641154766083, "rewards/accuracy_reward": 0.5078125, "rewards/brier_reward": 0.6378485560417175, "rewards/confidence_one_or_zero": 0.01627604104578495, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.8323665261268616, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4001.0, "completions/mean_length": 534.1875, "completions/mean_terminated_length": 484.81585693359375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.048, "grad_norm": 0.0011198230786249042, "learning_rate": 2.380952380952381e-06, "loss": 0.0157, "num_tokens": 37190691.0, "reward": 1.0358586311340332, "reward_std": 0.2967299222946167, "rewards/accuracy_reward": 0.4635416567325592, "rewards/brier_reward": 0.6244352459907532, "rewards/confidence_one_or_zero": 0.015625, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.8012434840202332, "step": 30 }, { "epoch": 0.048, "eval_completions/clipped_ratio": 0.014873798076923073, "eval_completions/max_length": 4063.0, "eval_completions/max_terminated_length": 2122.625, "eval_completions/mean_length": 566.7019996643066, "eval_completions/mean_terminated_length": 513.2030334472656, "eval_completions/min_length": 152.25, "eval_completions/min_terminated_length": 152.25, "eval_loss": 0.0, "eval_num_tokens": 37190691.0, "eval_reward": 1.066716879606247, "eval_reward_std": 0.42564020305871964, "eval_rewards/accuracy_reward": 0.5009765625, "eval_rewards/brier_reward": 0.6490424796938896, "eval_rewards/confidence_one_or_zero": 0.0146484375, "eval_rewards/format_reward": 0.9833984375, "eval_rewards/mean_confidence_reward": 0.7983046993613243, "eval_runtime": 259.763, "eval_samples_per_second": 3.85, "eval_steps_per_second": 0.031, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01692708333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4093.0, "completions/mean_length": 644.1322021484375, "completions/mean_terminated_length": 584.696044921875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.0496, "grad_norm": 0.0008192353416234255, "learning_rate": 2.4603174603174605e-06, "loss": 0.0151, "num_tokens": 38593614.0, "reward": 1.054540991783142, "reward_std": 0.2906680107116699, "rewards/accuracy_reward": 0.4850260317325592, "rewards/brier_reward": 0.6455240249633789, "rewards/confidence_one_or_zero": 0.01822916604578495, "rewards/format_reward": 0.978515625, "rewards/mean_confidence_reward": 0.7866471409797668, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3748.0, "completions/mean_length": 537.96875, "completions/mean_terminated_length": 512.3042602539062, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.0512, "grad_norm": 0.000897094898391515, "learning_rate": 2.53968253968254e-06, "loss": 0.0053, "num_tokens": 39821758.0, "reward": 1.0466340780258179, "reward_std": 0.29431256651878357, "rewards/accuracy_reward": 0.4654947817325592, "rewards/brier_reward": 0.6375229954719543, "rewards/confidence_one_or_zero": 0.0065104165114462376, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.784238338470459, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01497395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4090.0, "completions/mean_length": 593.7037963867188, "completions/mean_terminated_length": 540.4633178710938, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.0528, "grad_norm": 0.0005924258730374277, "learning_rate": 2.6190476190476192e-06, "loss": 0.0173, "num_tokens": 41138679.0, "reward": 1.109561800956726, "reward_std": 0.2731212377548218, "rewards/accuracy_reward": 0.548828125, "rewards/brier_reward": 0.6885090470314026, "rewards/confidence_one_or_zero": 0.0032552082557231188, "rewards/format_reward": 0.9817708134651184, "rewards/mean_confidence_reward": 0.7673177719116211, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3513.0, "completions/mean_length": 563.4440307617188, "completions/mean_terminated_length": 530.9500732421875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.0544, "grad_norm": 0.0008059733081609011, "learning_rate": 2.6984126984126986e-06, "loss": 0.0137, "num_tokens": 42433537.0, "reward": 1.149484634399414, "reward_std": 0.2993488311767578, "rewards/accuracy_reward": 0.5891926884651184, "rewards/brier_reward": 0.7201778292655945, "rewards/confidence_one_or_zero": 0.0013020833721384406, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.7677366137504578, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 611.8587646484375, "completions/mean_terminated_length": 586.7272338867188, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.056, "grad_norm": 0.00044726397027261555, "learning_rate": 2.7777777777777783e-06, "loss": 0.0076, "num_tokens": 43798984.0, "reward": 1.223744511604309, "reward_std": 0.21819576621055603, "rewards/accuracy_reward": 0.6946614384651184, "rewards/brier_reward": 0.7606249451637268, "rewards/confidence_one_or_zero": 0.0026041667442768812, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.749804675579071, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 591.2780151367188, "completions/mean_terminated_length": 577.5339965820312, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.0576, "grad_norm": 0.00045538871199823916, "learning_rate": 2.8571428571428573e-06, "loss": 0.005, "num_tokens": 45119219.0, "reward": 1.2407963275909424, "reward_std": 0.22135639190673828, "rewards/accuracy_reward": 0.703125, "rewards/brier_reward": 0.7869164347648621, "rewards/confidence_one_or_zero": 0.001953125, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.7386783957481384, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3980.0, "completions/mean_length": 614.1087646484375, "completions/mean_terminated_length": 593.5867919921875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.0592, "grad_norm": 0.0005006847204640508, "learning_rate": 2.936507936507937e-06, "loss": 0.0086, "num_tokens": 46490266.0, "reward": 1.1310484409332275, "reward_std": 0.23997369408607483, "rewards/accuracy_reward": 0.5520833134651184, "rewards/brier_reward": 0.7197647094726562, "rewards/confidence_one_or_zero": 0.0013020833721384406, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.7217578291893005, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 681.8431396484375, "completions/mean_terminated_length": 613.83203125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.0608, "grad_norm": 0.0004756015259772539, "learning_rate": 3.015873015873016e-06, "loss": 0.0177, "num_tokens": 47953257.0, "reward": 1.176099181175232, "reward_std": 0.25022947788238525, "rewards/accuracy_reward": 0.6373698115348816, "rewards/brier_reward": 0.7415072321891785, "rewards/confidence_one_or_zero": 0.0013020833721384406, "rewards/format_reward": 0.9733073115348816, "rewards/mean_confidence_reward": 0.705273449420929, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 681.5807495117188, "completions/mean_terminated_length": 645.6395263671875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.0624, "grad_norm": 0.0005206615896895528, "learning_rate": 3.0952380952380957e-06, "loss": 0.0165, "num_tokens": 49416581.0, "reward": 1.1074779033660889, "reward_std": 0.2743716239929199, "rewards/accuracy_reward": 0.521484375, "rewards/brier_reward": 0.7064778208732605, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.7094400525093079, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01627604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 721.1354370117188, "completions/mean_terminated_length": 665.297119140625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.064, "grad_norm": 0.00046394849778153, "learning_rate": 3.1746031746031746e-06, "loss": 0.0223, "num_tokens": 50951157.0, "reward": 1.1552793979644775, "reward_std": 0.2623785734176636, "rewards/accuracy_reward": 0.5970051884651184, "rewards/brier_reward": 0.7324197292327881, "rewards/confidence_one_or_zero": 0.0006510416860692203, "rewards/format_reward": 0.9811198115348816, "rewards/mean_confidence_reward": 0.6952798962593079, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 650.7650146484375, "completions/mean_terminated_length": 621.357177734375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.0656, "grad_norm": 0.000413050438510254, "learning_rate": 3.2539682539682544e-06, "loss": 0.014, "num_tokens": 52357804.0, "reward": 1.2222139835357666, "reward_std": 0.20803579688072205, "rewards/accuracy_reward": 0.685546875, "rewards/brier_reward": 0.771888017654419, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.6841146349906921, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 713.5078125, "completions/mean_terminated_length": 693.5717163085938, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.0672, "grad_norm": 0.0004045895184390247, "learning_rate": 3.3333333333333333e-06, "loss": 0.0086, "num_tokens": 53883448.0, "reward": 1.1814237833023071, "reward_std": 0.23944079875946045, "rewards/accuracy_reward": 0.62109375, "rewards/brier_reward": 0.7508544921875, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.6888997554779053, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 633.2474365234375, "completions/mean_terminated_length": 601.3955078125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.0688, "grad_norm": 0.00037209762376733124, "learning_rate": 3.412698412698413e-06, "loss": 0.0153, "num_tokens": 55261684.0, "reward": 1.2000703811645508, "reward_std": 0.18214035034179688, "rewards/accuracy_reward": 0.6497395634651184, "rewards/brier_reward": 0.7608039975166321, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.6770833134651184, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3976.0, "completions/mean_length": 747.1744995117188, "completions/mean_terminated_length": 698.5125732421875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.0704, "grad_norm": 0.00043484216439537704, "learning_rate": 3.492063492063492e-06, "loss": 0.0129, "num_tokens": 56822016.0, "reward": 1.1056984663009644, "reward_std": 0.23141507804393768, "rewards/accuracy_reward": 0.5234375, "rewards/brier_reward": 0.7081282734870911, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9798176884651184, "rewards/mean_confidence_reward": 0.666015625, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 733.0690307617188, "completions/mean_terminated_length": 702.1353149414062, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.072, "grad_norm": 0.0003432696685194969, "learning_rate": 3.5714285714285718e-06, "loss": 0.014, "num_tokens": 58353162.0, "reward": 1.197433352470398, "reward_std": 0.18480508029460907, "rewards/accuracy_reward": 0.6471354365348816, "rewards/brier_reward": 0.761389970779419, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.6524739861488342, "step": 45 }, { "epoch": 0.072, "eval_completions/clipped_ratio": 0.016301081730769232, "eval_completions/max_length": 3859.25, "eval_completions/max_terminated_length": 2522.875, "eval_completions/mean_length": 724.5620498657227, "eval_completions/mean_terminated_length": 668.658821105957, "eval_completions/min_length": 242.375, "eval_completions/min_terminated_length": 242.375, "eval_loss": 0.0, "eval_num_tokens": 58353162.0, "eval_reward": 1.1887515634298325, "eval_reward_std": 0.34329691156744957, "eval_rewards/accuracy_reward": 0.6396484375, "eval_rewards/brier_reward": 0.755419909954071, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.982421875, "eval_rewards/mean_confidence_reward": 0.6421875208616257, "eval_runtime": 254.1756, "eval_samples_per_second": 3.934, "eval_steps_per_second": 0.031, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 751.6478271484375, "completions/mean_terminated_length": 691.8085327148438, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.0736, "grad_norm": 0.00037752572097815573, "learning_rate": 3.6507936507936507e-06, "loss": 0.019, "num_tokens": 59914541.0, "reward": 1.1253092288970947, "reward_std": 0.2276439368724823, "rewards/accuracy_reward": 0.5455729365348816, "rewards/brier_reward": 0.7252147793769836, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9798176884651184, "rewards/mean_confidence_reward": 0.6427083611488342, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02473958333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 839.599609375, "completions/mean_terminated_length": 756.9940185546875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.0752, "grad_norm": 0.000350569054717198, "learning_rate": 3.7301587301587305e-06, "loss": 0.0246, "num_tokens": 61620102.0, "reward": 1.155785083770752, "reward_std": 0.22652345895767212, "rewards/accuracy_reward": 0.599609375, "rewards/brier_reward": 0.7431982159614563, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.96875, "rewards/mean_confidence_reward": 0.6193034052848816, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 621.4896240234375, "completions/mean_terminated_length": 610.1423950195312, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.0768, "grad_norm": 0.0005488215247169137, "learning_rate": 3.80952380952381e-06, "loss": 0.0052, "num_tokens": 62979638.0, "reward": 1.1818220615386963, "reward_std": 0.19564279913902283, "rewards/accuracy_reward": 0.6126301884651184, "rewards/brier_reward": 0.7549071311950684, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6345377564430237, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 719.66796875, "completions/mean_terminated_length": 670.6063842773438, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.0784, "grad_norm": 0.0003660090151242912, "learning_rate": 3.88888888888889e-06, "loss": 0.0147, "num_tokens": 64493816.0, "reward": 1.2375762462615967, "reward_std": 0.19935885071754456, "rewards/accuracy_reward": 0.7180989384651184, "rewards/brier_reward": 0.7765722274780273, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98046875, "rewards/mean_confidence_reward": 0.6166015863418579, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 677.2962646484375, "completions/mean_terminated_length": 652.63671875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.08, "grad_norm": 0.00030916326795704663, "learning_rate": 3.968253968253968e-06, "loss": 0.0118, "num_tokens": 65945855.0, "reward": 1.2285537719726562, "reward_std": 0.15463696420192719, "rewards/accuracy_reward": 0.6868489384651184, "rewards/brier_reward": 0.7800113558769226, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.6314127445220947, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 733.64453125, "completions/mean_terminated_length": 689.2863159179688, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.0816, "grad_norm": 0.00038590619806200266, "learning_rate": 4.047619047619048e-06, "loss": 0.0118, "num_tokens": 67470717.0, "reward": 1.1864783763885498, "reward_std": 0.20442676544189453, "rewards/accuracy_reward": 0.634765625, "rewards/brier_reward": 0.7551057934761047, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9830729365348816, "rewards/mean_confidence_reward": 0.6096028685569763, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3936.0, "completions/mean_length": 790.0579833984375, "completions/mean_terminated_length": 755.2586059570312, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.0832, "grad_norm": 0.00034537652391009033, "learning_rate": 4.126984126984127e-06, "loss": 0.0156, "num_tokens": 69106870.0, "reward": 1.2041394710540771, "reward_std": 0.20422860980033875, "rewards/accuracy_reward": 0.6640625, "rewards/brier_reward": 0.7604801058769226, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.6060221791267395, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 732.2005615234375, "completions/mean_terminated_length": 705.7139282226562, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.0848, "grad_norm": 0.00031076246523298323, "learning_rate": 4.206349206349207e-06, "loss": 0.0086, "num_tokens": 70640138.0, "reward": 1.2310808897018433, "reward_std": 0.17096857726573944, "rewards/accuracy_reward": 0.7005208134651184, "rewards/brier_reward": 0.7720453143119812, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.6123567819595337, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 773.2037963867188, "completions/mean_terminated_length": 700.2481689453125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.0864, "grad_norm": 0.0004118096549063921, "learning_rate": 4.2857142857142855e-06, "loss": 0.0255, "num_tokens": 72235939.0, "reward": 1.2178421020507812, "reward_std": 0.20634141564369202, "rewards/accuracy_reward": 0.6920573115348816, "rewards/brier_reward": 0.7664013504981995, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9772135615348816, "rewards/mean_confidence_reward": 0.5948893427848816, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 754.4752807617188, "completions/mean_terminated_length": 719.3013305664062, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.088, "grad_norm": 0.00031511677661910653, "learning_rate": 4.365079365079366e-06, "loss": 0.0095, "num_tokens": 73812093.0, "reward": 1.189154863357544, "reward_std": 0.157638281583786, "rewards/accuracy_reward": 0.626953125, "rewards/brier_reward": 0.7643652558326721, "rewards/confidence_one_or_zero": 0.0006510416860692203, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.6063802242279053, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3724.0, "completions/mean_length": 746.58203125, "completions/mean_terminated_length": 742.215087890625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.0896, "grad_norm": 0.000316188350552693, "learning_rate": 4.444444444444444e-06, "loss": 0.0031, "num_tokens": 75374139.0, "reward": 1.1907134056091309, "reward_std": 0.17242440581321716, "rewards/accuracy_reward": 0.6243489384651184, "rewards/brier_reward": 0.75901859998703, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6205403804779053, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 833.6829833984375, "completions/mean_terminated_length": 775.3115234375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.0912, "grad_norm": 0.0005548478802666068, "learning_rate": 4.523809523809524e-06, "loss": 0.015, "num_tokens": 77075540.0, "reward": 1.1708582639694214, "reward_std": 0.19103187322616577, "rewards/accuracy_reward": 0.6158854365348816, "rewards/brier_reward": 0.7473030090332031, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.978515625, "rewards/mean_confidence_reward": 0.6142252683639526, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01497395833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 767.5084838867188, "completions/mean_terminated_length": 716.91015625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.0928, "grad_norm": 0.0004442105127964169, "learning_rate": 4.603174603174604e-06, "loss": 0.0189, "num_tokens": 78666529.0, "reward": 1.2187408208847046, "reward_std": 0.17558959126472473, "rewards/accuracy_reward": 0.693359375, "rewards/brier_reward": 0.7629899382591248, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9811198115348816, "rewards/mean_confidence_reward": 0.6183919310569763, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3934.0, "completions/mean_length": 830.0111083984375, "completions/mean_terminated_length": 802.13330078125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.0944, "grad_norm": 0.00031338928965851665, "learning_rate": 4.682539682539683e-06, "loss": 0.0112, "num_tokens": 80378802.0, "reward": 1.2067129611968994, "reward_std": 0.1816485971212387, "rewards/accuracy_reward": 0.658203125, "rewards/brier_reward": 0.7649755477905273, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.6279622912406921, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 807.1002807617188, "completions/mean_terminated_length": 781.2034301757812, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.096, "grad_norm": 0.0004219993425067514, "learning_rate": 4.761904761904762e-06, "loss": 0.0095, "num_tokens": 82026988.0, "reward": 1.2029954195022583, "reward_std": 0.18720337748527527, "rewards/accuracy_reward": 0.6516926884651184, "rewards/brier_reward": 0.7647021412849426, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.62646484375, "step": 60 }, { "epoch": 0.096, "eval_completions/clipped_ratio": 0.0107421875, "eval_completions/max_length": 3320.75, "eval_completions/max_terminated_length": 2688.0, "eval_completions/mean_length": 822.0586700439453, "eval_completions/mean_terminated_length": 786.686653137207, "eval_completions/min_length": 302.125, "eval_completions/min_terminated_length": 302.125, "eval_loss": 0.0, "eval_num_tokens": 82026988.0, "eval_reward": 1.1983986347913742, "eval_reward_std": 0.32616620138287544, "eval_rewards/accuracy_reward": 0.6455078125, "eval_rewards/brier_reward": 0.7629956007003784, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.98828125, "eval_rewards/mean_confidence_reward": 0.6270019635558128, "eval_runtime": 219.1271, "eval_samples_per_second": 4.564, "eval_steps_per_second": 0.037, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4088.0, "completions/mean_length": 774.2884521484375, "completions/mean_terminated_length": 759.0810546875, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.0976, "grad_norm": 0.00036590738454833627, "learning_rate": 4.841269841269842e-06, "loss": 0.004, "num_tokens": 83639783.0, "reward": 1.2302050590515137, "reward_std": 0.1759772002696991, "rewards/accuracy_reward": 0.6868489384651184, "rewards/brier_reward": 0.7787564396858215, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.6401041746139526, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3996.0, "completions/mean_length": 834.9935302734375, "completions/mean_terminated_length": 796.325439453125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.0992, "grad_norm": 0.00037851225351914763, "learning_rate": 4.920634920634921e-06, "loss": 0.0142, "num_tokens": 85337309.0, "reward": 1.2450284957885742, "reward_std": 0.17814569175243378, "rewards/accuracy_reward": 0.7161458134651184, "rewards/brier_reward": 0.7888720631599426, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9850260615348816, "rewards/mean_confidence_reward": 0.6385742425918579, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 847.052734375, "completions/mean_terminated_length": 819.3204345703125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.1008, "grad_norm": 0.000289404415525496, "learning_rate": 5e-06, "loss": 0.0082, "num_tokens": 87061646.0, "reward": 1.2249886989593506, "reward_std": 0.16176146268844604, "rewards/accuracy_reward": 0.685546875, "rewards/brier_reward": 0.7761361002922058, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.645703136920929, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 812.046875, "completions/mean_terminated_length": 773.1067504882812, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.1024, "grad_norm": 0.00033190601971000433, "learning_rate": 4.980000000000001e-06, "loss": 0.0152, "num_tokens": 88725814.0, "reward": 1.2828524112701416, "reward_std": 0.15639185905456543, "rewards/accuracy_reward": 0.7740885615348816, "rewards/brier_reward": 0.8059260845184326, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.6543294191360474, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 796.6295776367188, "completions/mean_terminated_length": 781.5244750976562, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.104, "grad_norm": 0.00039051726344041526, "learning_rate": 4.960000000000001e-06, "loss": 0.0065, "num_tokens": 90357821.0, "reward": 1.2205055952072144, "reward_std": 0.182016059756279, "rewards/accuracy_reward": 0.669921875, "rewards/brier_reward": 0.7756331562995911, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.6615560054779053, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 951.28125, "completions/mean_terminated_length": 905.585205078125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.1056, "grad_norm": 0.0003536671574693173, "learning_rate": 4.94e-06, "loss": 0.0139, "num_tokens": 92236461.0, "reward": 1.1203354597091675, "reward_std": 0.2144804447889328, "rewards/accuracy_reward": 0.5377604365348816, "rewards/brier_reward": 0.7204751968383789, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.982421875, "rewards/mean_confidence_reward": 0.6655599474906921, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3140.0, "completions/mean_length": 878.3834838867188, "completions/mean_terminated_length": 859.4191284179688, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.1072, "grad_norm": 0.00036600593011826277, "learning_rate": 4.92e-06, "loss": 0.0076, "num_tokens": 94008122.0, "reward": 1.1621812582015991, "reward_std": 0.16693732142448425, "rewards/accuracy_reward": 0.5794270634651184, "rewards/brier_reward": 0.7507812976837158, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6688151359558105, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 813.7623901367188, "completions/mean_terminated_length": 792.2536010742188, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.1088, "grad_norm": 0.0003499346203170717, "learning_rate": 4.9000000000000005e-06, "loss": 0.008, "num_tokens": 95677613.0, "reward": 1.2330586910247803, "reward_std": 0.18279102444648743, "rewards/accuracy_reward": 0.6901041865348816, "rewards/brier_reward": 0.7838118672370911, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.6787760853767395, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3078.0, "completions/mean_length": 901.0221557617188, "completions/mean_terminated_length": 884.2944946289062, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.1104, "grad_norm": 0.0005581220029853284, "learning_rate": 4.880000000000001e-06, "loss": 0.003, "num_tokens": 97473679.0, "reward": 1.1641027927398682, "reward_std": 0.16793328523635864, "rewards/accuracy_reward": 0.5885416865348816, "rewards/brier_reward": 0.7455094456672668, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6837565302848816, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 894.3209838867188, "completions/mean_terminated_length": 869.1109008789062, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.112, "grad_norm": 0.00031328792101703584, "learning_rate": 4.86e-06, "loss": 0.0119, "num_tokens": 99260860.0, "reward": 1.1893843412399292, "reward_std": 0.19155435264110565, "rewards/accuracy_reward": 0.6217448115348816, "rewards/brier_reward": 0.7661247253417969, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.6818684935569763, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 796.6048583984375, "completions/mean_terminated_length": 790.1480712890625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.1136, "grad_norm": 0.00038486041012220085, "learning_rate": 4.84e-06, "loss": 0.0005, "num_tokens": 100891645.0, "reward": 1.2316420078277588, "reward_std": 0.16194000840187073, "rewards/accuracy_reward": 0.6790364384651184, "rewards/brier_reward": 0.7861865162849426, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6955403685569763, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3645.0, "completions/mean_length": 821.5286865234375, "completions/mean_terminated_length": 815.1206665039062, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.1152, "grad_norm": 0.00034173615858890116, "learning_rate": 4.8200000000000004e-06, "loss": 0.0039, "num_tokens": 102553673.0, "reward": 1.2730555534362793, "reward_std": 0.17514729499816895, "rewards/accuracy_reward": 0.74609375, "rewards/brier_reward": 0.8026074767112732, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7026041150093079, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3920.0, "completions/mean_length": 794.8470458984375, "completions/mean_terminated_length": 771.035400390625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.1168, "grad_norm": 0.0004782153991982341, "learning_rate": 4.800000000000001e-06, "loss": 0.0102, "num_tokens": 104178334.0, "reward": 1.1591476202011108, "reward_std": 0.16143248975276947, "rewards/accuracy_reward": 0.5852864384651184, "rewards/brier_reward": 0.7414583563804626, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.6977213025093079, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3663.0, "completions/mean_length": 840.041015625, "completions/mean_terminated_length": 803.6016845703125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.1184, "grad_norm": 0.00032794912112876773, "learning_rate": 4.78e-06, "loss": 0.013, "num_tokens": 105890205.0, "reward": 1.215286374092102, "reward_std": 0.1828882396221161, "rewards/accuracy_reward": 0.6731770634651184, "rewards/brier_reward": 0.7710530757904053, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.707714855670929, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 844.6361083984375, "completions/mean_terminated_length": 816.8831176757812, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.12, "grad_norm": 0.00036955776158720255, "learning_rate": 4.76e-06, "loss": 0.0062, "num_tokens": 107615662.0, "reward": 1.1810650825500488, "reward_std": 0.18503007292747498, "rewards/accuracy_reward": 0.6158854365348816, "rewards/brier_reward": 0.7553451061248779, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.701367199420929, "step": 75 }, { "epoch": 0.12, "eval_completions/clipped_ratio": 0.00390625, "eval_completions/max_length": 2761.875, "eval_completions/max_terminated_length": 2384.125, "eval_completions/mean_length": 806.0199127197266, "eval_completions/mean_terminated_length": 793.3070449829102, "eval_completions/min_length": 269.0, "eval_completions/min_terminated_length": 269.0, "eval_loss": 0.0, "eval_num_tokens": 107615662.0, "eval_reward": 1.2160824984312057, "eval_reward_std": 0.3406365439295769, "eval_rewards/accuracy_reward": 0.6591796875, "eval_rewards/brier_reward": 0.7788305580615997, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.994140625, "eval_rewards/mean_confidence_reward": 0.7031738236546516, "eval_runtime": 185.1053, "eval_samples_per_second": 5.402, "eval_steps_per_second": 0.043, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3155.0, "completions/mean_length": 792.61328125, "completions/mean_terminated_length": 779.6588745117188, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.1216, "grad_norm": 0.0003120205656159669, "learning_rate": 4.74e-06, "loss": 0.0056, "num_tokens": 109235228.0, "reward": 1.2159723043441772, "reward_std": 0.1362423151731491, "rewards/accuracy_reward": 0.6588541865348816, "rewards/brier_reward": 0.7769824862480164, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.7095702290534973, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3858.0, "completions/mean_length": 765.7741088867188, "completions/mean_terminated_length": 754.8980712890625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.1232, "grad_norm": 0.0003779090184252709, "learning_rate": 4.7200000000000005e-06, "loss": 0.0066, "num_tokens": 110822625.0, "reward": 1.2107127904891968, "reward_std": 0.19966839253902435, "rewards/accuracy_reward": 0.6555989384651184, "rewards/brier_reward": 0.7690674662590027, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.7091471552848816, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3092.0, "completions/mean_length": 821.7103271484375, "completions/mean_terminated_length": 813.1612548828125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.1248, "grad_norm": 0.0003351967316120863, "learning_rate": 4.7e-06, "loss": 0.0047, "num_tokens": 112496772.0, "reward": 1.1657956838607788, "reward_std": 0.1642550379037857, "rewards/accuracy_reward": 0.5904948115348816, "rewards/brier_reward": 0.7436864972114563, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7021159529685974, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 843.7747802734375, "completions/mean_terminated_length": 818.1666870117188, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.1264, "grad_norm": 0.00031675034551881254, "learning_rate": 4.680000000000001e-06, "loss": 0.0085, "num_tokens": 114195882.0, "reward": 1.216921091079712, "reward_std": 0.15486617386341095, "rewards/accuracy_reward": 0.6647135615348816, "rewards/brier_reward": 0.7788802981376648, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.6950520873069763, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3962.0, "completions/mean_length": 791.5364990234375, "completions/mean_terminated_length": 785.06982421875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.128, "grad_norm": 0.0003540362522471696, "learning_rate": 4.66e-06, "loss": 0.0021, "num_tokens": 115829282.0, "reward": 1.2249581813812256, "reward_std": 0.14827406406402588, "rewards/accuracy_reward": 0.6595051884651184, "rewards/brier_reward": 0.7930013537406921, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6871744990348816, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 820.7819213867188, "completions/mean_terminated_length": 807.9379272460938, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.1296, "grad_norm": 0.00040688845911063254, "learning_rate": 4.6400000000000005e-06, "loss": 0.012, "num_tokens": 117501523.0, "reward": 1.2625168561935425, "reward_std": 0.19915097951889038, "rewards/accuracy_reward": 0.7259114384651184, "rewards/brier_reward": 0.8036653995513916, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.6981770992279053, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 900.1139526367188, "completions/mean_terminated_length": 877.0616455078125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.1312, "grad_norm": 0.00029592218925245106, "learning_rate": 4.620000000000001e-06, "loss": 0.0083, "num_tokens": 119300962.0, "reward": 1.1935145854949951, "reward_std": 0.17606544494628906, "rewards/accuracy_reward": 0.634765625, "rewards/brier_reward": 0.7620150446891785, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.6895182132720947, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3987.0, "completions/mean_length": 843.0072021484375, "completions/mean_terminated_length": 800.0917358398438, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.1328, "grad_norm": 0.000340108061209321, "learning_rate": 4.600000000000001e-06, "loss": 0.0082, "num_tokens": 121012141.0, "reward": 1.215527057647705, "reward_std": 0.1809983104467392, "rewards/accuracy_reward": 0.6692708134651184, "rewards/brier_reward": 0.7767431735992432, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9850260615348816, "rewards/mean_confidence_reward": 0.6902669072151184, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 817.2174682617188, "completions/mean_terminated_length": 808.6566772460938, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.1344, "grad_norm": 0.00032137910602614284, "learning_rate": 4.58e-06, "loss": 0.0034, "num_tokens": 122677051.0, "reward": 1.175952672958374, "reward_std": 0.16854238510131836, "rewards/accuracy_reward": 0.6015625, "rewards/brier_reward": 0.7529329657554626, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6953775882720947, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 830.013671875, "completions/mean_terminated_length": 808.6114501953125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.136, "grad_norm": 0.00037100460031069815, "learning_rate": 4.56e-06, "loss": 0.0102, "num_tokens": 124358672.0, "reward": 1.1608409881591797, "reward_std": 0.2080398052930832, "rewards/accuracy_reward": 0.5787760615348816, "rewards/brier_reward": 0.7513557076454163, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.6830403804779053, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3111.0, "completions/mean_length": 739.7897338867188, "completions/mean_terminated_length": 735.4139404296875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.1376, "grad_norm": 0.00031246646540239453, "learning_rate": 4.540000000000001e-06, "loss": 0.0016, "num_tokens": 125896781.0, "reward": 1.2316111326217651, "reward_std": 0.15129771828651428, "rewards/accuracy_reward": 0.6783854365348816, "rewards/brier_reward": 0.7861247062683105, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.699902355670929, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3963.0, "completions/mean_length": 736.2311401367188, "completions/mean_terminated_length": 729.65625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.1392, "grad_norm": 0.0003998448664788157, "learning_rate": 4.520000000000001e-06, "loss": 0.0027, "num_tokens": 127443344.0, "reward": 1.2307980060577393, "reward_std": 0.1459086388349533, "rewards/accuracy_reward": 0.6731770634651184, "rewards/brier_reward": 0.7903580665588379, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6911458373069763, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3720.0, "completions/mean_length": 766.1966552734375, "completions/mean_terminated_length": 746.571044921875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.1408, "grad_norm": 0.00032334847492165864, "learning_rate": 4.5e-06, "loss": 0.0084, "num_tokens": 129030494.0, "reward": 1.2057101726531982, "reward_std": 0.16147273778915405, "rewards/accuracy_reward": 0.642578125, "rewards/brier_reward": 0.7746874690055847, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6915364265441895, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3957.0, "completions/mean_length": 889.3099365234375, "completions/mean_terminated_length": 876.7346801757812, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.1424, "grad_norm": 0.00035930072772316635, "learning_rate": 4.48e-06, "loss": 0.0077, "num_tokens": 130817882.0, "reward": 1.1389009952545166, "reward_std": 0.22972917556762695, "rewards/accuracy_reward": 0.541015625, "rewards/brier_reward": 0.7406787276268005, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6819987297058105, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 823.9739990234375, "completions/mean_terminated_length": 811.1425170898438, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.144, "grad_norm": 0.0002881527179852128, "learning_rate": 4.4600000000000005e-06, "loss": 0.0072, "num_tokens": 132504434.0, "reward": 1.2359681129455566, "reward_std": 0.1662517935037613, "rewards/accuracy_reward": 0.6848958134651184, "rewards/brier_reward": 0.7909326553344727, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.690136730670929, "step": 90 }, { "epoch": 0.144, "eval_completions/clipped_ratio": 0.005108173076923073, "eval_completions/max_length": 3164.125, "eval_completions/max_terminated_length": 2423.75, "eval_completions/mean_length": 762.0588989257812, "eval_completions/mean_terminated_length": 744.9603881835938, "eval_completions/min_length": 274.25, "eval_completions/min_terminated_length": 274.25, "eval_loss": 0.0, "eval_num_tokens": 132504434.0, "eval_reward": 1.2256001979112625, "eval_reward_std": 0.3334697335958481, "eval_rewards/accuracy_reward": 0.6728515625, "eval_rewards/brier_reward": 0.7851709201931953, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9931640625, "eval_rewards/mean_confidence_reward": 0.6870117112994194, "eval_runtime": 208.4984, "eval_samples_per_second": 4.796, "eval_steps_per_second": 0.038, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 796.34765625, "completions/mean_terminated_length": 787.7323608398438, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.1456, "grad_norm": 0.0003802562423516065, "learning_rate": 4.440000000000001e-06, "loss": 0.004, "num_tokens": 134158152.0, "reward": 1.1470496654510498, "reward_std": 0.17231294512748718, "rewards/accuracy_reward": 0.5546875, "rewards/brier_reward": 0.742002010345459, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6843098998069763, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 727.6764526367188, "completions/mean_terminated_length": 712.2556762695312, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.1472, "grad_norm": 0.00036934087984263897, "learning_rate": 4.42e-06, "loss": 0.0097, "num_tokens": 135682199.0, "reward": 1.2537178993225098, "reward_std": 0.1568048894405365, "rewards/accuracy_reward": 0.712890625, "rewards/brier_reward": 0.8003906607627869, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6870443224906921, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 711.9251708984375, "completions/mean_terminated_length": 705.3026733398438, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.1488, "grad_norm": 0.00036421205732040107, "learning_rate": 4.4e-06, "loss": 0.0037, "num_tokens": 137181540.0, "reward": 1.2532085180282593, "reward_std": 0.15252947807312012, "rewards/accuracy_reward": 0.712890625, "rewards/brier_reward": 0.7961165308952332, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6910156607627869, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 743.2994995117188, "completions/mean_terminated_length": 736.7384643554688, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.1504, "grad_norm": 0.0002964306331705302, "learning_rate": 4.38e-06, "loss": 0.001, "num_tokens": 138735888.0, "reward": 1.2554879188537598, "reward_std": 0.12074218690395355, "rewards/accuracy_reward": 0.7057291865348816, "rewards/brier_reward": 0.8071858286857605, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6804361343383789, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 783.3236083984375, "completions/mean_terminated_length": 779.0045776367188, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.152, "grad_norm": 0.0002860700769815594, "learning_rate": 4.360000000000001e-06, "loss": 0.0015, "num_tokens": 140345313.0, "reward": 1.1955863237380981, "reward_std": 0.13240596652030945, "rewards/accuracy_reward": 0.6165364384651184, "rewards/brier_reward": 0.7759245038032532, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6802734732627869, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3503.0, "completions/mean_length": 753.966796875, "completions/mean_terminated_length": 749.6094970703125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.1536, "grad_norm": 0.00034035355201922357, "learning_rate": 4.34e-06, "loss": 0.0066, "num_tokens": 141921294.0, "reward": 1.2735555171966553, "reward_std": 0.15444736182689667, "rewards/accuracy_reward": 0.73828125, "rewards/brier_reward": 0.8101181983947754, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6817838549613953, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3494.0, "completions/mean_length": 764.240234375, "completions/mean_terminated_length": 733.59326171875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.1552, "grad_norm": 0.0003727012372110039, "learning_rate": 4.32e-06, "loss": 0.0147, "num_tokens": 143496031.0, "reward": 1.2105886936187744, "reward_std": 0.2027515172958374, "rewards/accuracy_reward": 0.6529948115348816, "rewards/brier_reward": 0.7779345512390137, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.6689128279685974, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 753.412109375, "completions/mean_terminated_length": 742.4957275390625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.1568, "grad_norm": 0.0003599193296395242, "learning_rate": 4.3e-06, "loss": 0.0039, "num_tokens": 145064440.0, "reward": 1.158656358718872, "reward_std": 0.1604929268360138, "rewards/accuracy_reward": 0.5709635615348816, "rewards/brier_reward": 0.750241756439209, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6777734160423279, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2321.0, "completions/mean_length": 705.1686401367188, "completions/mean_terminated_length": 696.3153076171875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1584, "grad_norm": 0.00033617127337493, "learning_rate": 4.2800000000000005e-06, "loss": 0.0048, "num_tokens": 146559707.0, "reward": 1.282581090927124, "reward_std": 0.1384386420249939, "rewards/accuracy_reward": 0.7532551884651184, "rewards/brier_reward": 0.8144974708557129, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6700976490974426, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 703.4694213867188, "completions/mean_terminated_length": 696.8303833007812, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.16, "grad_norm": 0.0003312737389933318, "learning_rate": 4.26e-06, "loss": 0.0029, "num_tokens": 148043852.0, "reward": 1.232764720916748, "reward_std": 0.1461983025074005, "rewards/accuracy_reward": 0.6712239384651184, "rewards/brier_reward": 0.7962450981140137, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6718425154685974, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2602.0, "completions/mean_length": 759.8659057617188, "completions/mean_terminated_length": 751.1553344726562, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.1616, "grad_norm": 0.0003373540530446917, "learning_rate": 4.24e-06, "loss": 0.004, "num_tokens": 149628734.0, "reward": 1.1811647415161133, "reward_std": 0.16661325097084045, "rewards/accuracy_reward": 0.6002604365348816, "rewards/brier_reward": 0.7646598815917969, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6603190302848816, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 773.7396240234375, "completions/mean_terminated_length": 756.3455810546875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.1632, "grad_norm": 0.00033894009538926184, "learning_rate": 4.22e-06, "loss": 0.0093, "num_tokens": 151236046.0, "reward": 1.206495761871338, "reward_std": 0.17319798469543457, "rewards/accuracy_reward": 0.64453125, "rewards/brier_reward": 0.7736555933952332, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.6418619751930237, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 691.4869995117188, "completions/mean_terminated_length": 680.3683471679688, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.1648, "grad_norm": 0.0004259241686668247, "learning_rate": 4.2000000000000004e-06, "loss": 0.0047, "num_tokens": 152700538.0, "reward": 1.1941465139389038, "reward_std": 0.17021052539348602, "rewards/accuracy_reward": 0.6197916865348816, "rewards/brier_reward": 0.77174311876297, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.6581054329872131, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3389.0, "completions/mean_length": 734.6517333984375, "completions/mean_terminated_length": 721.469970703125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.1664, "grad_norm": 0.0004006985400337726, "learning_rate": 4.18e-06, "loss": 0.0031, "num_tokens": 154249891.0, "reward": 1.208742618560791, "reward_std": 0.177547425031662, "rewards/accuracy_reward": 0.6393229365348816, "rewards/brier_reward": 0.7827066779136658, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.6377604007720947, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3881.0, "completions/mean_length": 736.9056396484375, "completions/mean_terminated_length": 703.7784423828125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.168, "grad_norm": 0.0003371264028828591, "learning_rate": 4.16e-06, "loss": 0.0143, "num_tokens": 155786866.0, "reward": 1.2196658849716187, "reward_std": 0.15464960038661957, "rewards/accuracy_reward": 0.6627604365348816, "rewards/brier_reward": 0.7869752049446106, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.6406314969062805, "step": 105 }, { "epoch": 0.168, "eval_completions/clipped_ratio": 0.0029296875, "eval_completions/max_length": 3122.0, "eval_completions/max_terminated_length": 2342.625, "eval_completions/mean_length": 727.8534469604492, "eval_completions/mean_terminated_length": 718.0156936645508, "eval_completions/min_length": 281.75, "eval_completions/min_terminated_length": 281.75, "eval_loss": 0.0, "eval_num_tokens": 155786866.0, "eval_reward": 1.2247526347637177, "eval_reward_std": 0.3122938834130764, "eval_rewards/accuracy_reward": 0.669921875, "eval_rewards/brier_reward": 0.7834765538573265, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.99609375, "eval_rewards/mean_confidence_reward": 0.6519531384110451, "eval_runtime": 205.2195, "eval_samples_per_second": 4.873, "eval_steps_per_second": 0.039, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 670.09765625, "completions/mean_terminated_length": 667.8657836914062, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.1696, "grad_norm": 0.0004285164759494364, "learning_rate": 4.14e-06, "loss": 0.0033, "num_tokens": 157223688.0, "reward": 1.2618751525878906, "reward_std": 0.18720659613609314, "rewards/accuracy_reward": 0.7161458134651184, "rewards/brier_reward": 0.80824214220047, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6656250357627869, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3728.0, "completions/mean_length": 697.5494995117188, "completions/mean_terminated_length": 693.11865234375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.1712, "grad_norm": 0.0003714995982591063, "learning_rate": 4.12e-06, "loss": 0.0063, "num_tokens": 158702868.0, "reward": 1.251649260520935, "reward_std": 0.15804454684257507, "rewards/accuracy_reward": 0.7102864384651184, "rewards/brier_reward": 0.7943008542060852, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6590039134025574, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 667.4205932617188, "completions/mean_terminated_length": 662.950439453125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.1728, "grad_norm": 0.0003660858201328665, "learning_rate": 4.1e-06, "loss": 0.005, "num_tokens": 160129530.0, "reward": 1.2581586837768555, "reward_std": 0.1472911685705185, "rewards/accuracy_reward": 0.708984375, "rewards/brier_reward": 0.8086214065551758, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6736653447151184, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3305.0, "completions/mean_length": 756.3131713867188, "completions/mean_terminated_length": 747.5933837890625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.1744, "grad_norm": 0.0003455660480540246, "learning_rate": 4.08e-06, "loss": 0.006, "num_tokens": 161704315.0, "reward": 1.2030938863754272, "reward_std": 0.17204484343528748, "rewards/accuracy_reward": 0.6263020634651184, "rewards/brier_reward": 0.7824762463569641, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6723176836967468, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 743.0911865234375, "completions/mean_terminated_length": 738.7196655273438, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.176, "grad_norm": 0.0003964683855883777, "learning_rate": 4.060000000000001e-06, "loss": 0.0047, "num_tokens": 163256423.0, "reward": 1.2116862535476685, "reward_std": 0.17163251340389252, "rewards/accuracy_reward": 0.6458333134651184, "rewards/brier_reward": 0.7794783711433411, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6873763203620911, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3111.0, "completions/mean_length": 668.2838745117188, "completions/mean_terminated_length": 666.05078125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.1776, "grad_norm": 0.0003715313214343041, "learning_rate": 4.04e-06, "loss": 0.0019, "num_tokens": 164694683.0, "reward": 1.3029205799102783, "reward_std": 0.13794218003749847, "rewards/accuracy_reward": 0.7721354365348816, "rewards/brier_reward": 0.8343424797058105, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7081380486488342, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3026.0, "completions/mean_length": 748.748046875, "completions/mean_terminated_length": 744.3839721679688, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.1792, "grad_norm": 0.0003752574266400188, "learning_rate": 4.0200000000000005e-06, "loss": 0.0031, "num_tokens": 166256472.0, "reward": 1.2719032764434814, "reward_std": 0.1808651089668274, "rewards/accuracy_reward": 0.7350260615348816, "rewards/brier_reward": 0.8100683689117432, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7044270634651184, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 698.14453125, "completions/mean_terminated_length": 693.7144775390625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.1808, "grad_norm": 0.00033361720852553844, "learning_rate": 4.000000000000001e-06, "loss": 0.003, "num_tokens": 167736470.0, "reward": 1.2229270935058594, "reward_std": 0.131595641374588, "rewards/accuracy_reward": 0.66015625, "rewards/brier_reward": 0.7869856953620911, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7138671875, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 701.6334838867188, "completions/mean_terminated_length": 692.7708740234375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.1824, "grad_norm": 0.00039658095920458436, "learning_rate": 3.980000000000001e-06, "loss": 0.0023, "num_tokens": 169225091.0, "reward": 1.2004196643829346, "reward_std": 0.1787942349910736, "rewards/accuracy_reward": 0.630859375, "rewards/brier_reward": 0.7725699543952942, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7029622197151184, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3323.0, "completions/mean_length": 701.908203125, "completions/mean_terminated_length": 681.9037475585938, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.184, "grad_norm": 0.00036480152630247176, "learning_rate": 3.96e-06, "loss": 0.0031, "num_tokens": 170698454.0, "reward": 1.303797960281372, "reward_std": 0.1410493701696396, "rewards/accuracy_reward": 0.7864583134651184, "rewards/brier_reward": 0.8269822001457214, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.7303515076637268, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3403.0, "completions/mean_length": 808.70703125, "completions/mean_terminated_length": 791.49609375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1856, "grad_norm": 0.0004121044185012579, "learning_rate": 3.94e-06, "loss": 0.0042, "num_tokens": 172346004.0, "reward": 1.258159875869751, "reward_std": 0.20867344737052917, "rewards/accuracy_reward": 0.7220051884651184, "rewards/brier_reward": 0.7995083928108215, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.712109386920929, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 774.04296875, "completions/mean_terminated_length": 747.8858032226562, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.1872, "grad_norm": 0.0004193470813333988, "learning_rate": 3.920000000000001e-06, "loss": 0.0067, "num_tokens": 173943126.0, "reward": 1.1080067157745361, "reward_std": 0.18319690227508545, "rewards/accuracy_reward": 0.5045573115348816, "rewards/brier_reward": 0.7212076783180237, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.7042317390441895, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 718.9447021484375, "completions/mean_terminated_length": 716.74462890625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.1888, "grad_norm": 0.00038199377013370395, "learning_rate": 3.900000000000001e-06, "loss": -0.0007, "num_tokens": 175451745.0, "reward": 1.261458396911621, "reward_std": 0.15221717953681946, "rewards/accuracy_reward": 0.7122395634651184, "rewards/brier_reward": 0.8113134503364563, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7258138656616211, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3946.0, "completions/mean_length": 784.900390625, "completions/mean_terminated_length": 765.3850708007812, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1904, "grad_norm": 0.000439609051682055, "learning_rate": 3.88e-06, "loss": 0.004, "num_tokens": 177077576.0, "reward": 1.2126668691635132, "reward_std": 0.1822632998228073, "rewards/accuracy_reward": 0.6549479365348816, "rewards/brier_reward": 0.7775325775146484, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.7323567867279053, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2698.0, "completions/mean_length": 776.6400146484375, "completions/mean_terminated_length": 767.9732666015625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.192, "grad_norm": 0.00046251784078776836, "learning_rate": 3.86e-06, "loss": 0.0051, "num_tokens": 178672671.0, "reward": 1.2551473379135132, "reward_std": 0.17426535487174988, "rewards/accuracy_reward": 0.7122395634651184, "rewards/brier_reward": 0.80064457654953, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7307942509651184, "step": 120 }, { "epoch": 0.192, "eval_completions/clipped_ratio": 0.005859375, "eval_completions/max_length": 3136.25, "eval_completions/max_terminated_length": 2129.625, "eval_completions/mean_length": 730.6034469604492, "eval_completions/mean_terminated_length": 710.7637710571289, "eval_completions/min_length": 269.375, "eval_completions/min_terminated_length": 269.375, "eval_loss": 0.0, "eval_num_tokens": 178672671.0, "eval_reward": 1.2274413257837296, "eval_reward_std": 0.345088642090559, "eval_rewards/accuracy_reward": 0.669921875, "eval_rewards/brier_reward": 0.7908056750893593, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.994140625, "eval_rewards/mean_confidence_reward": 0.7237304747104645, "eval_runtime": 206.8322, "eval_samples_per_second": 4.835, "eval_steps_per_second": 0.039, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3842.0, "completions/mean_length": 717.8190307617188, "completions/mean_terminated_length": 704.5712890625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.1936, "grad_norm": 0.00046462996397167444, "learning_rate": 3.8400000000000005e-06, "loss": 0.0058, "num_tokens": 180193449.0, "reward": 1.2373697757720947, "reward_std": 0.17275390028953552, "rewards/accuracy_reward": 0.6842448115348816, "rewards/brier_reward": 0.7943863868713379, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.728222668170929, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 714.1302490234375, "completions/mean_terminated_length": 709.720947265625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.1952, "grad_norm": 0.00042622009641490877, "learning_rate": 3.820000000000001e-06, "loss": 0.0049, "num_tokens": 181706193.0, "reward": 1.2354695796966553, "reward_std": 0.17324881255626678, "rewards/accuracy_reward": 0.68359375, "rewards/brier_reward": 0.788632869720459, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.73046875, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2770.0, "completions/mean_length": 712.3952026367188, "completions/mean_terminated_length": 707.9837036132812, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.1968, "grad_norm": 0.0003642081283032894, "learning_rate": 3.8000000000000005e-06, "loss": 0.0039, "num_tokens": 183219280.0, "reward": 1.301632285118103, "reward_std": 0.14450380206108093, "rewards/accuracy_reward": 0.7701823115348816, "rewards/brier_reward": 0.8343694806098938, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7392447590827942, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 723.4381713867188, "completions/mean_terminated_length": 716.8382568359375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.1984, "grad_norm": 0.00038551478064619005, "learning_rate": 3.7800000000000002e-06, "loss": 0.0017, "num_tokens": 184738225.0, "reward": 1.274287223815918, "reward_std": 0.16167762875556946, "rewards/accuracy_reward": 0.732421875, "rewards/brier_reward": 0.8180907368659973, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.7431314587593079, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 860.0404052734375, "completions/mean_terminated_length": 843.0982055664062, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2, "grad_norm": 0.00041836954187601805, "learning_rate": 3.7600000000000004e-06, "loss": 0.0061, "num_tokens": 186454543.0, "reward": 1.198172926902771, "reward_std": 0.20565760135650635, "rewards/accuracy_reward": 0.6328125, "rewards/brier_reward": 0.7693782448768616, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.7186198234558105, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2794.0, "completions/mean_length": 714.6419677734375, "completions/mean_terminated_length": 712.4390869140625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.2016, "grad_norm": 0.0004118600336369127, "learning_rate": 3.74e-06, "loss": 0.0021, "num_tokens": 187968105.0, "reward": 1.273183822631836, "reward_std": 0.13359642028808594, "rewards/accuracy_reward": 0.7311198115348816, "rewards/brier_reward": 0.8158838152885437, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7421548962593079, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 803.7767333984375, "completions/mean_terminated_length": 799.4843139648438, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.2032, "grad_norm": 0.0004159506061114371, "learning_rate": 3.7200000000000004e-06, "loss": 0.0052, "num_tokens": 189616882.0, "reward": 1.1954622268676758, "reward_std": 0.1611214578151703, "rewards/accuracy_reward": 0.6158854365348816, "rewards/brier_reward": 0.7769775390625, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.7234049439430237, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2484.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 698.6849365234375, "completions/mean_terminated_length": 698.6849365234375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.2048, "grad_norm": 0.00038827655953355134, "learning_rate": 3.7e-06, "loss": 0.0045, "num_tokens": 191092526.0, "reward": 1.2807329893112183, "reward_std": 0.12674254179000854, "rewards/accuracy_reward": 0.7415364384651184, "rewards/brier_reward": 0.8199146389961243, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7418836951255798, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3073.0, "completions/mean_length": 704.1119995117188, "completions/mean_terminated_length": 699.689697265625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.2064, "grad_norm": 0.0004101810045540333, "learning_rate": 3.6800000000000003e-06, "loss": 0.0052, "num_tokens": 192580250.0, "reward": 1.218442440032959, "reward_std": 0.1562412828207016, "rewards/accuracy_reward": 0.650390625, "rewards/brier_reward": 0.7877815365791321, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7261393666267395, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 774.1341552734375, "completions/mean_terminated_length": 758.9260864257812, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.208, "grad_norm": 0.0004672614159062505, "learning_rate": 3.66e-06, "loss": 0.0095, "num_tokens": 194191304.0, "reward": 1.2249877452850342, "reward_std": 0.17843040823936462, "rewards/accuracy_reward": 0.662109375, "rewards/brier_reward": 0.7924087643623352, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.7134764790534973, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 661.0325927734375, "completions/mean_terminated_length": 652.06396484375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.2096, "grad_norm": 0.0005046098958700895, "learning_rate": 3.6400000000000003e-06, "loss": 0.0052, "num_tokens": 195604634.0, "reward": 1.2108311653137207, "reward_std": 0.15098439157009125, "rewards/accuracy_reward": 0.6451823115348816, "rewards/brier_reward": 0.7790699005126953, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7204318642616272, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 695.9095458984375, "completions/mean_terminated_length": 693.6944580078125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2112, "grad_norm": 0.0004189969622530043, "learning_rate": 3.62e-06, "loss": 0.0004, "num_tokens": 197085135.0, "reward": 1.280419111251831, "reward_std": 0.14440715312957764, "rewards/accuracy_reward": 0.7376301884651184, "rewards/brier_reward": 0.8238444328308105, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7199218273162842, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3760.0, "completions/mean_length": 747.7435302734375, "completions/mean_terminated_length": 745.5621948242188, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.2128, "grad_norm": 0.0005624310579150915, "learning_rate": 3.6000000000000003e-06, "loss": 0.003, "num_tokens": 198649797.0, "reward": 1.251842975616455, "reward_std": 0.150738924741745, "rewards/accuracy_reward": 0.697265625, "rewards/brier_reward": 0.8070573210716248, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6994791030883789, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 703.6705932617188, "completions/mean_terminated_length": 697.031982421875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.2144, "grad_norm": 0.0004546296549960971, "learning_rate": 3.58e-06, "loss": 0.0045, "num_tokens": 200140139.0, "reward": 1.277174949645996, "reward_std": 0.15649235248565674, "rewards/accuracy_reward": 0.7434895634651184, "rewards/brier_reward": 0.8134503364562988, "rewards/confidence_one_or_zero": 0.0006510416860692203, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7060220837593079, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 728.3900146484375, "completions/mean_terminated_length": 721.7997436523438, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.216, "grad_norm": 0.00041746970964595675, "learning_rate": 3.5600000000000002e-06, "loss": 0.0006, "num_tokens": 201662978.0, "reward": 1.202057123184204, "reward_std": 0.1491517424583435, "rewards/accuracy_reward": 0.6360676884651184, "rewards/brier_reward": 0.7699853777885437, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.70068359375, "step": 135 }, { "epoch": 0.216, "eval_completions/clipped_ratio": 0.001953125, "eval_completions/max_length": 2925.25, "eval_completions/max_terminated_length": 2523.375, "eval_completions/mean_length": 723.9132385253906, "eval_completions/mean_terminated_length": 717.3126678466797, "eval_completions/min_length": 239.25, "eval_completions/min_terminated_length": 239.25, "eval_loss": 0.0, "eval_num_tokens": 201662978.0, "eval_reward": 1.233010932803154, "eval_reward_std": 0.3160964548587799, "eval_rewards/accuracy_reward": 0.666015625, "eval_rewards/brier_reward": 0.801945798099041, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.998046875, "eval_rewards/mean_confidence_reward": 0.6765136793255806, "eval_runtime": 192.5581, "eval_samples_per_second": 5.193, "eval_steps_per_second": 0.042, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3169.0, "completions/max_terminated_length": 3169.0, "completions/mean_length": 727.4603271484375, "completions/mean_terminated_length": 727.4603271484375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.2176, "grad_norm": 0.00038422815850935876, "learning_rate": 3.54e-06, "loss": 0.0007, "num_tokens": 203189733.0, "reward": 1.2340912818908691, "reward_std": 0.1419995129108429, "rewards/accuracy_reward": 0.6595051884651184, "rewards/brier_reward": 0.8086637854576111, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6796548962593079, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4096.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 738.9661865234375, "completions/mean_terminated_length": 738.9661865234375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.2192, "grad_norm": 0.00037418928695842624, "learning_rate": 3.52e-06, "loss": 0.0021, "num_tokens": 204743377.0, "reward": 1.2469637393951416, "reward_std": 0.13167211413383484, "rewards/accuracy_reward": 0.677734375, "rewards/brier_reward": 0.8161800503730774, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6509439945220947, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 677.5794677734375, "completions/mean_terminated_length": 677.5794677734375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.2208, "grad_norm": 0.0003999176842626184, "learning_rate": 3.5e-06, "loss": 0.0004, "num_tokens": 206189131.0, "reward": 1.226180911064148, "reward_std": 0.1387290060520172, "rewards/accuracy_reward": 0.6510416865348816, "rewards/brier_reward": 0.8013069033622742, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6552408933639526, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 643.1100463867188, "completions/mean_terminated_length": 640.860595703125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.2224, "grad_norm": 0.0004304470494389534, "learning_rate": 3.48e-06, "loss": 0.0013, "num_tokens": 207584884.0, "reward": 1.2522257566452026, "reward_std": 0.14682745933532715, "rewards/accuracy_reward": 0.7057291865348816, "rewards/brier_reward": 0.7993603348731995, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.64501953125, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 694.8248901367188, "completions/mean_terminated_length": 694.8248901367188, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.224, "grad_norm": 0.0003110190446022898, "learning_rate": 3.46e-06, "loss": 0.0029, "num_tokens": 209070663.0, "reward": 1.292207956314087, "reward_std": 0.11835722625255585, "rewards/accuracy_reward": 0.759765625, "rewards/brier_reward": 0.8246371150016785, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6553710699081421, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3736.0, "completions/max_terminated_length": 3736.0, "completions/mean_length": 716.3034057617188, "completions/mean_terminated_length": 716.3034057617188, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2256, "grad_norm": 0.0003173888835590333, "learning_rate": 3.44e-06, "loss": 0.0024, "num_tokens": 210589593.0, "reward": 1.2268574237823486, "reward_std": 0.12963539361953735, "rewards/accuracy_reward": 0.6608073115348816, "rewards/brier_reward": 0.794196605682373, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6370573043823242, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3879.0, "completions/mean_length": 684.0084838867188, "completions/mean_terminated_length": 677.3314208984375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.2272, "grad_norm": 0.0003647625562734902, "learning_rate": 3.4200000000000007e-06, "loss": 0.0028, "num_tokens": 212044262.0, "reward": 1.265296220779419, "reward_std": 0.13758443295955658, "rewards/accuracy_reward": 0.7233073115348816, "rewards/brier_reward": 0.8098762631416321, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6520833373069763, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 735.2864990234375, "completions/mean_terminated_length": 730.90478515625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.2288, "grad_norm": 0.00031859189039096236, "learning_rate": 3.4000000000000005e-06, "loss": 0.0023, "num_tokens": 213599902.0, "reward": 1.1942328214645386, "reward_std": 0.13861986994743347, "rewards/accuracy_reward": 0.6171875, "rewards/brier_reward": 0.7738688588142395, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6640299558639526, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 693.1107177734375, "completions/mean_terminated_length": 693.1107177734375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.2304, "grad_norm": 0.0003453449171502143, "learning_rate": 3.3800000000000007e-06, "loss": 0.0017, "num_tokens": 215072936.0, "reward": 1.2690236568450928, "reward_std": 0.13904152810573578, "rewards/accuracy_reward": 0.7200520634651184, "rewards/brier_reward": 0.8186328411102295, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6652994751930237, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 704.8001708984375, "completions/mean_terminated_length": 678.0977783203125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.232, "grad_norm": 0.00037236526259221137, "learning_rate": 3.3600000000000004e-06, "loss": 0.0085, "num_tokens": 216576949.0, "reward": 1.2243051528930664, "reward_std": 0.1482827514410019, "rewards/accuracy_reward": 0.6640625, "rewards/brier_reward": 0.7923469543457031, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.6575520634651184, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 658.4407958984375, "completions/mean_terminated_length": 649.4653930664062, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.2336, "grad_norm": 0.00040452994289807975, "learning_rate": 3.3400000000000006e-06, "loss": 0.006, "num_tokens": 217992986.0, "reward": 1.2368491888046265, "reward_std": 0.16234716773033142, "rewards/accuracy_reward": 0.6673176884651184, "rewards/brier_reward": 0.8096224665641785, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.6665363907814026, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 767.7845458984375, "completions/mean_terminated_length": 765.6162719726562, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.2352, "grad_norm": 0.000340078811859712, "learning_rate": 3.3200000000000004e-06, "loss": 0.0033, "num_tokens": 219573967.0, "reward": 1.2452166080474854, "reward_std": 0.15559011697769165, "rewards/accuracy_reward": 0.6770833134651184, "rewards/brier_reward": 0.8139875531196594, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6592448353767395, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3422.0, "completions/mean_length": 716.21875, "completions/mean_terminated_length": 714.0169067382812, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.2368, "grad_norm": 0.0003522939223330468, "learning_rate": 3.3000000000000006e-06, "loss": 0.0025, "num_tokens": 221088639.0, "reward": 1.3024237155914307, "reward_std": 0.16475433111190796, "rewards/accuracy_reward": 0.7708333134651184, "rewards/brier_reward": 0.8346517086029053, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6688150763511658, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3243.0, "completions/mean_length": 669.4779052734375, "completions/mean_terminated_length": 665.0104370117188, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.2384, "grad_norm": 0.0003961655020248145, "learning_rate": 3.2800000000000004e-06, "loss": 0.0027, "num_tokens": 222525565.0, "reward": 1.2096707820892334, "reward_std": 0.15979653596878052, "rewards/accuracy_reward": 0.62109375, "rewards/brier_reward": 0.7995361685752869, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6741211414337158, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3242.0, "completions/mean_length": 740.951171875, "completions/mean_terminated_length": 732.1912841796875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.24, "grad_norm": 0.00032879505306482315, "learning_rate": 3.2600000000000006e-06, "loss": 0.0067, "num_tokens": 224086674.0, "reward": 1.2454150915145874, "reward_std": 0.1362927407026291, "rewards/accuracy_reward": 0.6647135615348816, "rewards/brier_reward": 0.8293587565422058, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.653515636920929, "step": 150 }, { "epoch": 0.24, "eval_completions/clipped_ratio": 0.0009765625, "eval_completions/max_length": 2459.75, "eval_completions/max_terminated_length": 2187.0, "eval_completions/mean_length": 699.432243347168, "eval_completions/mean_terminated_length": 696.1262283325195, "eval_completions/min_length": 240.0, "eval_completions/min_terminated_length": 240.0, "eval_loss": 0.0, "eval_num_tokens": 224086674.0, "eval_reward": 1.2464326173067093, "eval_reward_std": 0.3137114681303501, "eval_rewards/accuracy_reward": 0.685546875, "eval_rewards/brier_reward": 0.8092578127980232, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.998046875, "eval_rewards/mean_confidence_reward": 0.6809570342302322, "eval_runtime": 163.2584, "eval_samples_per_second": 6.125, "eval_steps_per_second": 0.049, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 706.3470458984375, "completions/mean_terminated_length": 677.4136962890625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.2416, "grad_norm": 0.00035863128141500056, "learning_rate": 3.2400000000000003e-06, "loss": 0.0091, "num_tokens": 225578791.0, "reward": 1.2492650747299194, "reward_std": 0.15215274691581726, "rewards/accuracy_reward": 0.6946614384651184, "rewards/brier_reward": 0.8162246346473694, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9876301884651184, "rewards/mean_confidence_reward": 0.6784310340881348, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 700.740234375, "completions/mean_terminated_length": 698.5283203125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.2432, "grad_norm": 0.00031168534769676626, "learning_rate": 3.2200000000000005e-06, "loss": 0.0027, "num_tokens": 227066616.0, "reward": 1.295917272567749, "reward_std": 0.1307864487171173, "rewards/accuracy_reward": 0.763671875, "rewards/brier_reward": 0.8287995457649231, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.695397138595581, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 683.7662963867188, "completions/mean_terminated_length": 677.0887451171875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.2448, "grad_norm": 0.00035393788130022585, "learning_rate": 3.2000000000000003e-06, "loss": 0.0015, "num_tokens": 228524721.0, "reward": 1.2801485061645508, "reward_std": 0.1461012363433838, "rewards/accuracy_reward": 0.7317708134651184, "rewards/brier_reward": 0.8304653167724609, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6927148699760437, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 673.4127807617188, "completions/mean_terminated_length": 664.4765014648438, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.2464, "grad_norm": 0.0003325925499666482, "learning_rate": 3.1800000000000005e-06, "loss": 0.0032, "num_tokens": 229963243.0, "reward": 1.281780481338501, "reward_std": 0.1279451698064804, "rewards/accuracy_reward": 0.7389323115348816, "rewards/brier_reward": 0.8272189497947693, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6818358302116394, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3340.0, "completions/mean_length": 691.763671875, "completions/mean_terminated_length": 689.5458984375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.248, "grad_norm": 0.0003599083283916116, "learning_rate": 3.1600000000000002e-06, "loss": 0.003, "num_tokens": 231437664.0, "reward": 1.2404148578643799, "reward_std": 0.15770293772220612, "rewards/accuracy_reward": 0.6770833134651184, "rewards/brier_reward": 0.8043830990791321, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7010743021965027, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00520833333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3998.0, "completions/mean_length": 804.265625, "completions/mean_terminated_length": 787.0314331054688, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2496, "grad_norm": 0.00029416769393719733, "learning_rate": 3.1400000000000004e-06, "loss": 0.0075, "num_tokens": 233086264.0, "reward": 1.1881619691848755, "reward_std": 0.15146377682685852, "rewards/accuracy_reward": 0.599609375, "rewards/brier_reward": 0.783862292766571, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.6775715947151184, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2777.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 725.765625, "completions/mean_terminated_length": 725.765625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.2512, "grad_norm": 0.0003722517576534301, "learning_rate": 3.12e-06, "loss": -0.0008, "num_tokens": 234618640.0, "reward": 1.1860971450805664, "reward_std": 0.1709679216146469, "rewards/accuracy_reward": 0.5950520634651184, "rewards/brier_reward": 0.7771281599998474, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6954372525215149, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3477.0, "completions/mean_length": 630.4622802734375, "completions/mean_terminated_length": 625.9439086914062, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.2528, "grad_norm": 0.0003852552908938378, "learning_rate": 3.1000000000000004e-06, "loss": 0.0033, "num_tokens": 235996246.0, "reward": 1.2188093662261963, "reward_std": 0.16556227207183838, "rewards/accuracy_reward": 0.6516926884651184, "rewards/brier_reward": 0.7904689908027649, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.7202755808830261, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4008.0, "completions/mean_length": 742.7702026367188, "completions/mean_terminated_length": 729.6203002929688, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.2544, "grad_norm": 0.0003424190217629075, "learning_rate": 3.08e-06, "loss": 0.0066, "num_tokens": 237552821.0, "reward": 1.2825796604156494, "reward_std": 0.1687416136264801, "rewards/accuracy_reward": 0.7467448115348816, "rewards/brier_reward": 0.8236086964607239, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.6970138549804688, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 769.2037963867188, "completions/mean_terminated_length": 764.8663330078125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.256, "grad_norm": 0.00025332788936793804, "learning_rate": 3.0600000000000003e-06, "loss": 0.0043, "num_tokens": 239148974.0, "reward": 1.2768299579620361, "reward_std": 0.13337822258472443, "rewards/accuracy_reward": 0.7272135615348816, "rewards/brier_reward": 0.8283855319023132, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6939083933830261, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2901.0, "completions/mean_length": 698.9850463867188, "completions/mean_terminated_length": 690.1155395507812, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.2576, "grad_norm": 0.0003336727968417108, "learning_rate": 3.04e-06, "loss": 0.0043, "num_tokens": 240637687.0, "reward": 1.2580978870391846, "reward_std": 0.13903328776359558, "rewards/accuracy_reward": 0.69140625, "rewards/brier_reward": 0.8273795247077942, "rewards/confidence_one_or_zero": 0.0006510416860692203, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6982421875, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 736.453125, "completions/mean_terminated_length": 721.0725708007812, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.2592, "grad_norm": 0.0003266559215262532, "learning_rate": 3.0200000000000003e-06, "loss": 0.0072, "num_tokens": 242177039.0, "reward": 1.2972761392593384, "reward_std": 0.14918260276317596, "rewards/accuracy_reward": 0.7486979365348816, "rewards/brier_reward": 0.8503971695899963, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.7184244990348816, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 721.49609375, "completions/mean_terminated_length": 708.2627563476562, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.2608, "grad_norm": 0.0002870375756174326, "learning_rate": 3e-06, "loss": 0.005, "num_tokens": 243691081.0, "reward": 1.2882651090621948, "reward_std": 0.1163727268576622, "rewards/accuracy_reward": 0.73828125, "rewards/brier_reward": 0.8427920937538147, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.6999521851539612, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 691.5443115234375, "completions/mean_terminated_length": 691.5443115234375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.2624, "grad_norm": 0.0003439621941652149, "learning_rate": 2.9800000000000003e-06, "loss": 0.0003, "num_tokens": 245170349.0, "reward": 1.2349555492401123, "reward_std": 0.16181586682796478, "rewards/accuracy_reward": 0.6744791865348816, "rewards/brier_reward": 0.7954173684120178, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7181184887886047, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3965.0, "completions/mean_length": 674.1302490234375, "completions/mean_terminated_length": 665.1958618164062, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.264, "grad_norm": 0.0003700863162521273, "learning_rate": 2.96e-06, "loss": 0.0031, "num_tokens": 246632085.0, "reward": 1.267493486404419, "reward_std": 0.16339465975761414, "rewards/accuracy_reward": 0.7194010615348816, "rewards/brier_reward": 0.8188264966011047, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.7181965708732605, "step": 165 }, { "epoch": 0.264, "eval_completions/clipped_ratio": 0.0009765625, "eval_completions/max_length": 2090.5, "eval_completions/max_terminated_length": 2019.5, "eval_completions/mean_length": 693.4591369628906, "eval_completions/mean_terminated_length": 690.1595764160156, "eval_completions/min_length": 238.625, "eval_completions/min_terminated_length": 238.625, "eval_loss": 0.0, "eval_num_tokens": 246632085.0, "eval_reward": 1.2449833005666733, "eval_reward_std": 0.3172280713915825, "eval_rewards/accuracy_reward": 0.677734375, "eval_rewards/brier_reward": 0.8131947442889214, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9990234375, "eval_rewards/mean_confidence_reward": 0.7019368261098862, "eval_runtime": 139.2657, "eval_samples_per_second": 7.181, "eval_steps_per_second": 0.057, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 670.53125, "completions/mean_terminated_length": 670.53125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.2656, "grad_norm": 0.00030478346161544323, "learning_rate": 2.9400000000000002e-06, "loss": 0.0017, "num_tokens": 248068677.0, "reward": 1.1726031303405762, "reward_std": 0.13403402268886566, "rewards/accuracy_reward": 0.5930989384651184, "rewards/brier_reward": 0.7520926594734192, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7226996421813965, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 676.9127807617188, "completions/mean_terminated_length": 667.9856567382812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.2672, "grad_norm": 0.00030173835693858564, "learning_rate": 2.92e-06, "loss": 0.0044, "num_tokens": 249510431.0, "reward": 1.3023178577423096, "reward_std": 0.13932368159294128, "rewards/accuracy_reward": 0.7825520634651184, "rewards/brier_reward": 0.8246728777885437, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7281575202941895, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 798.9407958984375, "completions/mean_terminated_length": 751.0310668945312, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.2688, "grad_norm": 0.0002671535185072571, "learning_rate": 2.9e-06, "loss": 0.008, "num_tokens": 251147428.0, "reward": 1.2024803161621094, "reward_std": 0.14870837330818176, "rewards/accuracy_reward": 0.62109375, "rewards/brier_reward": 0.8014317154884338, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.982421875, "rewards/mean_confidence_reward": 0.66141277551651, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 775.6569213867188, "completions/mean_terminated_length": 749.512451171875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2704, "grad_norm": 0.0009601937490515411, "learning_rate": 2.88e-06, "loss": 0.0071, "num_tokens": 252760149.0, "reward": 1.1882447004318237, "reward_std": 0.1685955971479416, "rewards/accuracy_reward": 0.5989583134651184, "rewards/brier_reward": 0.7872835993766785, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.6479253172874451, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3932.0, "completions/mean_length": 773.849609375, "completions/mean_terminated_length": 771.685302734375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.272, "grad_norm": 0.0003097024455200881, "learning_rate": 2.86e-06, "loss": 0.0054, "num_tokens": 254366286.0, "reward": 1.2390365600585938, "reward_std": 0.1697988212108612, "rewards/accuracy_reward": 0.6647135615348816, "rewards/brier_reward": 0.8146483898162842, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6590494513511658, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3989.0, "completions/mean_length": 694.6517333984375, "completions/mean_terminated_length": 687.9954223632812, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.2736, "grad_norm": 0.00033000909024849534, "learning_rate": 2.84e-06, "loss": 0.0078, "num_tokens": 255833239.0, "reward": 1.3064475059509277, "reward_std": 0.1525462418794632, "rewards/accuracy_reward": 0.7649739384651184, "rewards/brier_reward": 0.8505108952522278, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7028646469116211, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2633.0, "completions/mean_length": 696.0202026367188, "completions/mean_terminated_length": 689.3666381835938, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.2752, "grad_norm": 0.00030296496697701514, "learning_rate": 2.82e-06, "loss": 0.0035, "num_tokens": 257315958.0, "reward": 1.2318323850631714, "reward_std": 0.1538444459438324, "rewards/accuracy_reward": 0.669921875, "rewards/brier_reward": 0.7956822514533997, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6737629771232605, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3084.0, "completions/mean_length": 745.0423583984375, "completions/mean_terminated_length": 742.8592529296875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.2768, "grad_norm": 0.0002990367647726089, "learning_rate": 2.8000000000000003e-06, "loss": 0.0035, "num_tokens": 258880311.0, "reward": 1.2601661682128906, "reward_std": 0.14305296540260315, "rewards/accuracy_reward": 0.6829426884651184, "rewards/brier_reward": 0.83802729845047, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6590494513511658, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 663.1126708984375, "completions/mean_terminated_length": 647.3963012695312, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.2784, "grad_norm": 0.0003260111843701452, "learning_rate": 2.7800000000000005e-06, "loss": 0.0082, "num_tokens": 260306884.0, "reward": 1.3131041526794434, "reward_std": 0.1533985286951065, "rewards/accuracy_reward": 0.7884114384651184, "rewards/brier_reward": 0.84364253282547, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6766276359558105, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2169.0, "completions/max_terminated_length": 2169.0, "completions/mean_length": 691.1334838867188, "completions/mean_terminated_length": 691.1334838867188, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.28, "grad_norm": 0.0003225924738217145, "learning_rate": 2.7600000000000003e-06, "loss": -0.0013, "num_tokens": 261776753.0, "reward": 1.2730109691619873, "reward_std": 0.14922770857810974, "rewards/accuracy_reward": 0.7252604365348816, "rewards/brier_reward": 0.8207475543022156, "rewards/confidence_one_or_zero": 0.0006510416860692203, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6841991543769836, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 687.0384521484375, "completions/mean_terminated_length": 682.5938720703125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.2816, "grad_norm": 0.0003303214907646179, "learning_rate": 2.7400000000000004e-06, "loss": 0.0047, "num_tokens": 263245580.0, "reward": 1.2377278804779053, "reward_std": 0.1398918330669403, "rewards/accuracy_reward": 0.6673176884651184, "rewards/brier_reward": 0.8094267249107361, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6635026335716248, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3086.0, "completions/mean_length": 766.625, "completions/mean_terminated_length": 760.109619140625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.2832, "grad_norm": 0.0002739731571637094, "learning_rate": 2.7200000000000002e-06, "loss": 0.0041, "num_tokens": 264836332.0, "reward": 1.2218523025512695, "reward_std": 0.14854612946510315, "rewards/accuracy_reward": 0.6490885615348816, "rewards/brier_reward": 0.7965560555458069, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6456770896911621, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 711.3529052734375, "completions/mean_terminated_length": 704.7293090820312, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.2848, "grad_norm": 0.0002777015033643693, "learning_rate": 2.7000000000000004e-06, "loss": 0.0042, "num_tokens": 266340618.0, "reward": 1.2438610792160034, "reward_std": 0.14774654805660248, "rewards/accuracy_reward": 0.6803385615348816, "rewards/brier_reward": 0.8099741339683533, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6761263012886047, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3920.0, "completions/mean_length": 684.2213745117188, "completions/mean_terminated_length": 681.9986572265625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.2864, "grad_norm": 0.0002459449751768261, "learning_rate": 2.68e-06, "loss": 0.0031, "num_tokens": 267793822.0, "reward": 1.2035410404205322, "reward_std": 0.12963557243347168, "rewards/accuracy_reward": 0.6236979365348816, "rewards/brier_reward": 0.7846729159355164, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6600781083106995, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 811.6673583984375, "completions/mean_terminated_length": 792.3097534179688, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.288, "grad_norm": 0.00022686159354634583, "learning_rate": 2.6600000000000004e-06, "loss": 0.0056, "num_tokens": 269453759.0, "reward": 1.2652864456176758, "reward_std": 0.12721675634384155, "rewards/accuracy_reward": 0.703125, "rewards/brier_reward": 0.8332950472831726, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6138672232627869, "step": 180 }, { "epoch": 0.288, "eval_completions/clipped_ratio": 0.001953125, "eval_completions/max_length": 2761.0, "eval_completions/max_terminated_length": 2190.125, "eval_completions/mean_length": 731.758415222168, "eval_completions/mean_terminated_length": 725.1538925170898, "eval_completions/min_length": 245.5, "eval_completions/min_terminated_length": 245.5, "eval_loss": 0.0, "eval_num_tokens": 269453759.0, "eval_reward": 1.2450296878814697, "eval_reward_std": 0.30219918116927147, "eval_rewards/accuracy_reward": 0.677734375, "eval_rewards/brier_reward": 0.815241701900959, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9970703125, "eval_rewards/mean_confidence_reward": 0.6520019620656967, "eval_runtime": 180.3752, "eval_samples_per_second": 5.544, "eval_steps_per_second": 0.044, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3162.0, "completions/mean_length": 751.7493896484375, "completions/mean_terminated_length": 743.0176391601562, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.2896, "grad_norm": 0.0002296247548656538, "learning_rate": 2.64e-06, "loss": 0.0047, "num_tokens": 271021022.0, "reward": 1.2494231462478638, "reward_std": 0.13265298306941986, "rewards/accuracy_reward": 0.6861979365348816, "rewards/brier_reward": 0.8152399659156799, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6269010901451111, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3347.0, "completions/mean_length": 727.970703125, "completions/mean_terminated_length": 716.9712524414062, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.2912, "grad_norm": 0.00030195244471542537, "learning_rate": 2.6200000000000003e-06, "loss": 0.0033, "num_tokens": 272558225.0, "reward": 1.241723656654358, "reward_std": 0.14886151254177094, "rewards/accuracy_reward": 0.6751301884651184, "rewards/brier_reward": 0.8122102618217468, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6455989480018616, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3987.0, "completions/mean_length": 669.4147338867188, "completions/mean_terminated_length": 664.9472045898438, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.2928, "grad_norm": 0.00026114785578101873, "learning_rate": 2.6e-06, "loss": 0.002, "num_tokens": 274008942.0, "reward": 1.2621148824691772, "reward_std": 0.12769341468811035, "rewards/accuracy_reward": 0.6966145634651184, "rewards/brier_reward": 0.8295550346374512, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6602083444595337, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2405.0, "completions/mean_length": 720.4544677734375, "completions/mean_terminated_length": 716.053466796875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.2944, "grad_norm": 0.00028812006348744035, "learning_rate": 2.5800000000000003e-06, "loss": 0.0022, "num_tokens": 275531240.0, "reward": 1.2271807193756104, "reward_std": 0.15855127573013306, "rewards/accuracy_reward": 0.6549479365348816, "rewards/brier_reward": 0.8007020354270935, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.682324230670929, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3809.0, "completions/mean_length": 710.8600463867188, "completions/mean_terminated_length": 704.2354736328125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.296, "grad_norm": 0.0002796650805976242, "learning_rate": 2.56e-06, "loss": 0.0029, "num_tokens": 277040401.0, "reward": 1.2891216278076172, "reward_std": 0.13483284413814545, "rewards/accuracy_reward": 0.7447916865348816, "rewards/brier_reward": 0.8353909850120544, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6749935150146484, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 766.0651245117188, "completions/mean_terminated_length": 761.7235717773438, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2976, "grad_norm": 0.00030117694404907525, "learning_rate": 2.5400000000000002e-06, "loss": 0.0026, "num_tokens": 278631477.0, "reward": 1.2256301641464233, "reward_std": 0.17254416644573212, "rewards/accuracy_reward": 0.6373698115348816, "rewards/brier_reward": 0.8151800632476807, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6263541579246521, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3290.0, "completions/mean_length": 750.0111083984375, "completions/mean_terminated_length": 736.8895874023438, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.2992, "grad_norm": 0.0002885512658394873, "learning_rate": 2.52e-06, "loss": 0.0053, "num_tokens": 280203430.0, "reward": 1.2685060501098633, "reward_std": 0.15959160029888153, "rewards/accuracy_reward": 0.7102864384651184, "rewards/brier_reward": 0.8306184411048889, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6720051765441895, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 799.8060302734375, "completions/mean_terminated_length": 780.3785400390625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.3008, "grad_norm": 0.00028348705382086337, "learning_rate": 2.5e-06, "loss": 0.0062, "num_tokens": 281835100.0, "reward": 1.23994779586792, "reward_std": 0.16003717482089996, "rewards/accuracy_reward": 0.6822916865348816, "rewards/brier_reward": 0.8034496307373047, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.6760937571525574, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 717.9140625, "completions/mean_terminated_length": 711.3033447265625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.3024, "grad_norm": 0.00032843186636455357, "learning_rate": 2.4800000000000004e-06, "loss": 0.007, "num_tokens": 283350968.0, "reward": 1.2918379306793213, "reward_std": 0.17009499669075012, "rewards/accuracy_reward": 0.7513020634651184, "rewards/brier_reward": 0.834313154220581, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6832030415534973, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2940.0, "completions/mean_length": 780.9401245117188, "completions/mean_terminated_length": 770.1136474609375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.304, "grad_norm": 0.0003035848494619131, "learning_rate": 2.46e-06, "loss": 0.0073, "num_tokens": 284963132.0, "reward": 1.2325888872146606, "reward_std": 0.16241192817687988, "rewards/accuracy_reward": 0.6569010615348816, "rewards/brier_reward": 0.8115188479423523, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.6490950584411621, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 792.2135620117188, "completions/mean_terminated_length": 783.5874633789062, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.3056, "grad_norm": 0.00029277699650265276, "learning_rate": 2.4400000000000004e-06, "loss": 0.0015, "num_tokens": 286608900.0, "reward": 1.2141036987304688, "reward_std": 0.14805182814598083, "rewards/accuracy_reward": 0.6321614384651184, "rewards/brier_reward": 0.7986369729042053, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6557682156562805, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3623.0, "completions/mean_length": 806.8854370117188, "completions/mean_terminated_length": 800.4487915039062, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.3072, "grad_norm": 0.00025400330196134746, "learning_rate": 2.42e-06, "loss": 0.0033, "num_tokens": 288269012.0, "reward": 1.2122504711151123, "reward_std": 0.14429521560668945, "rewards/accuracy_reward": 0.6217448115348816, "rewards/brier_reward": 0.8046958446502686, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6713671684265137, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1906.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 692.6705932617188, "completions/mean_terminated_length": 692.6705932617188, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.3088, "grad_norm": 0.00025946690584532917, "learning_rate": 2.4000000000000003e-06, "loss": 0.0008, "num_tokens": 289751162.0, "reward": 1.3152220249176025, "reward_std": 0.12808707356452942, "rewards/accuracy_reward": 0.7682291865348816, "rewards/brier_reward": 0.8622004985809326, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7031901478767395, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 724.3541870117188, "completions/mean_terminated_length": 722.1576538085938, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.3104, "grad_norm": 0.000303194421576336, "learning_rate": 2.38e-06, "loss": 0.0034, "num_tokens": 291276538.0, "reward": 1.2516835927963257, "reward_std": 0.15960410237312317, "rewards/accuracy_reward": 0.6901041865348816, "rewards/brier_reward": 0.8139000535011292, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7021288871765137, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 732.6595458984375, "completions/mean_terminated_length": 730.4683837890625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.312, "grad_norm": 0.0003036792913917452, "learning_rate": 2.3600000000000003e-06, "loss": 0.0029, "num_tokens": 292813359.0, "reward": 1.2197331190109253, "reward_std": 0.1738263964653015, "rewards/accuracy_reward": 0.642578125, "rewards/brier_reward": 0.798176109790802, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6941927075386047, "step": 195 }, { "epoch": 0.312, "eval_completions/clipped_ratio": 0.0029296875, "eval_completions/max_length": 2943.25, "eval_completions/max_terminated_length": 2129.625, "eval_completions/mean_length": 744.2492523193359, "eval_completions/mean_terminated_length": 734.3755950927734, "eval_completions/min_length": 247.625, "eval_completions/min_terminated_length": 247.625, "eval_loss": 0.0, "eval_num_tokens": 292813359.0, "eval_reward": 1.2508796155452728, "eval_reward_std": 0.3142912834882736, "eval_rewards/accuracy_reward": 0.6865234375, "eval_rewards/brier_reward": 0.8181517645716667, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9970703125, "eval_rewards/mean_confidence_reward": 0.6857031211256981, "eval_runtime": 193.4223, "eval_samples_per_second": 5.17, "eval_steps_per_second": 0.041, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3947.0, "completions/max_terminated_length": 3947.0, "completions/mean_length": 773.689453125, "completions/mean_terminated_length": 773.689453125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3136, "grad_norm": 0.0002566716284491122, "learning_rate": 2.3400000000000005e-06, "loss": 0.0015, "num_tokens": 294422450.0, "reward": 1.2534570693969727, "reward_std": 0.13306227326393127, "rewards/accuracy_reward": 0.689453125, "rewards/brier_reward": 0.8180981278419495, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6880989670753479, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 692.97265625, "completions/mean_terminated_length": 688.5358276367188, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.3152, "grad_norm": 0.0003281522949691862, "learning_rate": 2.3200000000000002e-06, "loss": 0.0017, "num_tokens": 295896872.0, "reward": 1.2341904640197754, "reward_std": 0.16633640229701996, "rewards/accuracy_reward": 0.6516926884651184, "rewards/brier_reward": 0.8179764747619629, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6840038895606995, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 744.1849365234375, "completions/mean_terminated_length": 739.8148193359375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.3168, "grad_norm": 0.0002939295372925699, "learning_rate": 2.3000000000000004e-06, "loss": 0.002, "num_tokens": 297442052.0, "reward": 1.2711775302886963, "reward_std": 0.15218901634216309, "rewards/accuracy_reward": 0.7115885615348816, "rewards/brier_reward": 0.8320547938346863, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6780142784118652, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 810.4655151367188, "completions/mean_terminated_length": 806.181884765625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.3184, "grad_norm": 0.00023884234542492777, "learning_rate": 2.28e-06, "loss": 0.0045, "num_tokens": 299104911.0, "reward": 1.2427078485488892, "reward_std": 0.14226338267326355, "rewards/accuracy_reward": 0.6640625, "rewards/brier_reward": 0.8226415514945984, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.678417980670929, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 688.955078125, "completions/mean_terminated_length": 688.955078125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.32, "grad_norm": 0.0002637612633407116, "learning_rate": 2.2600000000000004e-06, "loss": 0.0031, "num_tokens": 300577482.0, "reward": 1.2900599241256714, "reward_std": 0.13062447309494019, "rewards/accuracy_reward": 0.7376301884651184, "rewards/brier_reward": 0.8424755930900574, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6950846314430237, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3723.0, "completions/mean_length": 735.4622802734375, "completions/mean_terminated_length": 728.8858642578125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.3216, "grad_norm": 0.00026520894607529044, "learning_rate": 2.24e-06, "loss": 0.0055, "num_tokens": 302129008.0, "reward": 1.283205270767212, "reward_std": 0.1325000375509262, "rewards/accuracy_reward": 0.71875, "rewards/brier_reward": 0.8502506613731384, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6880208849906921, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 693.8014526367188, "completions/mean_terminated_length": 689.36572265625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.3232, "grad_norm": 0.00030505601898767054, "learning_rate": 2.2200000000000003e-06, "loss": 0.0044, "num_tokens": 303599807.0, "reward": 1.2173573970794678, "reward_std": 0.15380731225013733, "rewards/accuracy_reward": 0.638671875, "rewards/brier_reward": 0.7973307967185974, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6977214217185974, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 714.9193115234375, "completions/mean_terminated_length": 710.5110473632812, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3248, "grad_norm": 0.00030023083672858775, "learning_rate": 2.2e-06, "loss": 0.003, "num_tokens": 305106435.0, "reward": 1.2466018199920654, "reward_std": 0.15282773971557617, "rewards/accuracy_reward": 0.6809895634651184, "rewards/brier_reward": 0.8135018944740295, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7095920443534851, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 703.67578125, "completions/mean_terminated_length": 701.4657592773438, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.3264, "grad_norm": 0.00033823170815594494, "learning_rate": 2.1800000000000003e-06, "loss": 0.002, "num_tokens": 306588561.0, "reward": 1.2991374731063843, "reward_std": 0.17175593972206116, "rewards/accuracy_reward": 0.75390625, "rewards/brier_reward": 0.8450052738189697, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7182747721672058, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 719.1732177734375, "completions/mean_terminated_length": 716.9732666015625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.328, "grad_norm": 0.0003019451105501503, "learning_rate": 2.16e-06, "loss": 0.0034, "num_tokens": 308104763.0, "reward": 1.287405014038086, "reward_std": 0.11788296699523926, "rewards/accuracy_reward": 0.736328125, "rewards/brier_reward": 0.8391183018684387, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7293685078620911, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 721.57421875, "completions/mean_terminated_length": 719.3758544921875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3296, "grad_norm": 0.00028889725217595696, "learning_rate": 2.1400000000000003e-06, "loss": 0.0002, "num_tokens": 309635117.0, "reward": 1.2466142177581787, "reward_std": 0.15450644493103027, "rewards/accuracy_reward": 0.6764323115348816, "rewards/brier_reward": 0.817432701587677, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.716393232345581, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 719.0631713867188, "completions/mean_terminated_length": 712.4546508789062, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3312, "grad_norm": 0.00033217977033928037, "learning_rate": 2.12e-06, "loss": 0.0033, "num_tokens": 311163470.0, "reward": 1.2038352489471436, "reward_std": 0.167741060256958, "rewards/accuracy_reward": 0.62890625, "rewards/brier_reward": 0.780703067779541, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6936848759651184, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 690.4759521484375, "completions/mean_terminated_length": 663.6607666015625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.3328, "grad_norm": 0.0002878858067560941, "learning_rate": 2.1000000000000002e-06, "loss": 0.0089, "num_tokens": 312632201.0, "reward": 1.2503248453140259, "reward_std": 0.14695434272289276, "rewards/accuracy_reward": 0.6868489384651184, "rewards/brier_reward": 0.8215987086296082, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.7232552170753479, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 741.9069213867188, "completions/mean_terminated_length": 737.5338745117188, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.3344, "grad_norm": 0.00030193888233043253, "learning_rate": 2.08e-06, "loss": 0.0043, "num_tokens": 314182266.0, "reward": 1.2189598083496094, "reward_std": 0.1613154113292694, "rewards/accuracy_reward": 0.6263020634651184, "rewards/brier_reward": 0.8135563731193542, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.7068359851837158, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2120.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 751.2116088867188, "completions/mean_terminated_length": 751.2116088867188, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.336, "grad_norm": 0.0002683236089069396, "learning_rate": 2.06e-06, "loss": -0.0002, "num_tokens": 315755359.0, "reward": 1.2340011596679688, "reward_std": 0.14506465196609497, "rewards/accuracy_reward": 0.6380208134651184, "rewards/brier_reward": 0.8299677968025208, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6825000643730164, "step": 210 }, { "epoch": 0.336, "eval_completions/clipped_ratio": 0.0029296875, "eval_completions/max_length": 2924.625, "eval_completions/max_terminated_length": 2258.875, "eval_completions/mean_length": 746.1707534790039, "eval_completions/mean_terminated_length": 736.3797302246094, "eval_completions/min_length": 244.625, "eval_completions/min_terminated_length": 244.625, "eval_loss": 0.0, "eval_num_tokens": 315755359.0, "eval_reward": 1.2560944259166718, "eval_reward_std": 0.32324112951755524, "eval_rewards/accuracy_reward": 0.6875, "eval_rewards/brier_reward": 0.8276041150093079, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9970703125, "eval_rewards/mean_confidence_reward": 0.7178515791893005, "eval_runtime": 191.4209, "eval_samples_per_second": 5.224, "eval_steps_per_second": 0.042, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 754.1784057617188, "completions/mean_terminated_length": 745.4530029296875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.3376, "grad_norm": 0.0002631341922096908, "learning_rate": 2.04e-06, "loss": 0.0037, "num_tokens": 317315249.0, "reward": 1.280827522277832, "reward_std": 0.13670861721038818, "rewards/accuracy_reward": 0.712890625, "rewards/brier_reward": 0.8513541221618652, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7143229842185974, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 692.11328125, "completions/mean_terminated_length": 687.6753540039062, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3392, "grad_norm": 0.00034236942883580923, "learning_rate": 2.02e-06, "loss": 0.004, "num_tokens": 318781055.0, "reward": 1.3246086835861206, "reward_std": 0.16596822440624237, "rewards/accuracy_reward": 0.7962239384651184, "rewards/brier_reward": 0.8542806506156921, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7417318224906921, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 714.7376708984375, "completions/mean_terminated_length": 705.9093017578125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.3408, "grad_norm": 0.0002729625266510993, "learning_rate": 2.0000000000000003e-06, "loss": 0.0054, "num_tokens": 320293004.0, "reward": 1.2722610235214233, "reward_std": 0.15230190753936768, "rewards/accuracy_reward": 0.7044270634651184, "rewards/brier_reward": 0.8426845073699951, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.71937495470047, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 621.7252807617188, "completions/mean_terminated_length": 621.7252807617188, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.3424, "grad_norm": 0.0003270265005994588, "learning_rate": 1.98e-06, "loss": -0.0002, "num_tokens": 321646726.0, "reward": 1.293480396270752, "reward_std": 0.10694856941699982, "rewards/accuracy_reward": 0.7389323115348816, "rewards/brier_reward": 0.8480134010314941, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7458203434944153, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 753.6751708984375, "completions/mean_terminated_length": 751.4976806640625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.344, "grad_norm": 0.0003029298968613148, "learning_rate": 1.9600000000000003e-06, "loss": 0.002, "num_tokens": 323219699.0, "reward": 1.2616493701934814, "reward_std": 0.1534969061613083, "rewards/accuracy_reward": 0.6940104365348816, "rewards/brier_reward": 0.8299252390861511, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.707115888595581, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3633.0, "completions/mean_length": 766.654296875, "completions/mean_terminated_length": 762.3135375976562, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.3456, "grad_norm": 0.00027078299899585545, "learning_rate": 1.94e-06, "loss": 0.0003, "num_tokens": 324812032.0, "reward": 1.2803882360458374, "reward_std": 0.14616751670837402, "rewards/accuracy_reward": 0.7337239384651184, "rewards/brier_reward": 0.8289908766746521, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.7239583134651184, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2760.0, "completions/mean_length": 718.95703125, "completions/mean_terminated_length": 714.5540771484375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.3472, "grad_norm": 0.00030023674480617046, "learning_rate": 1.9200000000000003e-06, "loss": 0.0031, "num_tokens": 326324414.0, "reward": 1.2623405456542969, "reward_std": 0.13667868077754974, "rewards/accuracy_reward": 0.7063801884651184, "rewards/brier_reward": 0.8195882439613342, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7281575798988342, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 730.92578125, "completions/mean_terminated_length": 728.7335205078125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.3488, "grad_norm": 0.0005918736569583416, "learning_rate": 1.9000000000000002e-06, "loss": 0.0014, "num_tokens": 327872172.0, "reward": 1.1902053356170654, "reward_std": 0.14602167904376984, "rewards/accuracy_reward": 0.6022135615348816, "rewards/brier_reward": 0.7788329720497131, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7478190064430237, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 802.609375, "completions/mean_terminated_length": 796.1644287109375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.3504, "grad_norm": 0.00027477252297103405, "learning_rate": 1.8800000000000002e-06, "loss": 0.0025, "num_tokens": 329514228.0, "reward": 1.2460846900939941, "reward_std": 0.13635504245758057, "rewards/accuracy_reward": 0.6731770634651184, "rewards/brier_reward": 0.8209307789802551, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.7321680188179016, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 771.7142333984375, "completions/mean_terminated_length": 767.3800659179688, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.352, "grad_norm": 0.00028805126203224063, "learning_rate": 1.8600000000000002e-06, "loss": 0.0017, "num_tokens": 331113981.0, "reward": 1.2951687574386597, "reward_std": 0.11136069148778915, "rewards/accuracy_reward": 0.74609375, "rewards/brier_reward": 0.8455306887626648, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.755859375, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 783.994140625, "completions/mean_terminated_length": 764.4735107421875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.3536, "grad_norm": 0.0006522721378132701, "learning_rate": 1.8400000000000002e-06, "loss": 0.01, "num_tokens": 332728116.0, "reward": 1.2538447380065918, "reward_std": 0.17251627147197723, "rewards/accuracy_reward": 0.6940104365348816, "rewards/brier_reward": 0.8201748728752136, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.7311719059944153, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3964.0, "completions/mean_length": 855.2318115234375, "completions/mean_terminated_length": 853.1204833984375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.3552, "grad_norm": 0.0004265681200195104, "learning_rate": 1.8200000000000002e-06, "loss": 0.0025, "num_tokens": 334456152.0, "reward": 1.2469146251678467, "reward_std": 0.1749715507030487, "rewards/accuracy_reward": 0.689453125, "rewards/brier_reward": 0.8050124049186707, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7235872149467468, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 820.0690307617188, "completions/mean_terminated_length": 820.0690307617188, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.3568, "grad_norm": 0.0002977746189571917, "learning_rate": 1.8000000000000001e-06, "loss": 0.0021, "num_tokens": 336128130.0, "reward": 1.282802939414978, "reward_std": 0.1401340514421463, "rewards/accuracy_reward": 0.7233073115348816, "rewards/brier_reward": 0.8422842025756836, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.70682293176651, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 835.8327026367188, "completions/mean_terminated_length": 831.5820922851562, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.3584, "grad_norm": 0.0003760127874556929, "learning_rate": 1.7800000000000001e-06, "loss": 0.0046, "num_tokens": 337833633.0, "reward": 1.2917841672897339, "reward_std": 0.1464608907699585, "rewards/accuracy_reward": 0.732421875, "rewards/brier_reward": 0.852433979511261, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7150260806083679, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2906.0, "completions/mean_length": 775.2877807617188, "completions/mean_terminated_length": 762.265380859375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.36, "grad_norm": 0.0006701724487356842, "learning_rate": 1.76e-06, "loss": 0.0032, "num_tokens": 339437179.0, "reward": 1.2580100297927856, "reward_std": 0.16563762724399567, "rewards/accuracy_reward": 0.67578125, "rewards/brier_reward": 0.8441312313079834, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6817578673362732, "step": 225 }, { "epoch": 0.36, "eval_completions/clipped_ratio": 0.0009765625, "eval_completions/max_length": 2246.375, "eval_completions/max_terminated_length": 2040.75, "eval_completions/mean_length": 758.134765625, "eval_completions/mean_terminated_length": 754.9086151123047, "eval_completions/min_length": 256.25, "eval_completions/min_terminated_length": 256.25, "eval_loss": 0.0, "eval_num_tokens": 339437179.0, "eval_reward": 1.254147469997406, "eval_reward_std": 0.3167754113674164, "eval_rewards/accuracy_reward": 0.6884765625, "eval_rewards/brier_reward": 0.8207807764410973, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9990234375, "eval_rewards/mean_confidence_reward": 0.708076186478138, "eval_runtime": 151.4002, "eval_samples_per_second": 6.605, "eval_steps_per_second": 0.053, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3891.0, "completions/mean_length": 740.33984375, "completions/mean_terminated_length": 735.9647827148438, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3616, "grad_norm": 0.00025495572481304407, "learning_rate": 1.74e-06, "loss": 0.0065, "num_tokens": 340988421.0, "reward": 1.3220109939575195, "reward_std": 0.11819528788328171, "rewards/accuracy_reward": 0.7799479365348816, "rewards/brier_reward": 0.8660127520561218, "rewards/confidence_one_or_zero": 0.0006510416860692203, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.7239062190055847, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 720.2389526367188, "completions/mean_terminated_length": 718.0397338867188, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3632, "grad_norm": 0.0003479801525827497, "learning_rate": 1.72e-06, "loss": 0.0048, "num_tokens": 342500276.0, "reward": 1.2929198741912842, "reward_std": 0.15440824627876282, "rewards/accuracy_reward": 0.7311198115348816, "rewards/brier_reward": 0.8553566932678223, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.71302729845047, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1935.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 690.1784057617188, "completions/mean_terminated_length": 690.1784057617188, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.3648, "grad_norm": 0.00030864260043017566, "learning_rate": 1.7000000000000002e-06, "loss": 0.0008, "num_tokens": 343978694.0, "reward": 1.3068809509277344, "reward_std": 0.12174777686595917, "rewards/accuracy_reward": 0.763671875, "rewards/brier_reward": 0.850074827671051, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7526041865348816, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 798.3236083984375, "completions/mean_terminated_length": 796.1752319335938, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.3664, "grad_norm": 0.0004133770416956395, "learning_rate": 1.6800000000000002e-06, "loss": 0.0027, "num_tokens": 345622775.0, "reward": 1.2852174043655396, "reward_std": 0.12057226896286011, "rewards/accuracy_reward": 0.7122395634651184, "rewards/brier_reward": 0.8588324189186096, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6848047375679016, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 746.4329833984375, "completions/mean_terminated_length": 739.8780517578125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.368, "grad_norm": 0.00038740818854421377, "learning_rate": 1.6600000000000002e-06, "loss": 0.0026, "num_tokens": 347180720.0, "reward": 1.3153413534164429, "reward_std": 0.14443665742874146, "rewards/accuracy_reward": 0.7682291865348816, "rewards/brier_reward": 0.8643924593925476, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.712181031703949, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2276.0, "completions/mean_length": 785.6373901367188, "completions/mean_terminated_length": 781.3213500976562, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.3696, "grad_norm": 0.0007521872757934034, "learning_rate": 1.6400000000000002e-06, "loss": 0.0053, "num_tokens": 348808323.0, "reward": 1.2982304096221924, "reward_std": 0.14852076768875122, "rewards/accuracy_reward": 0.7421875, "rewards/brier_reward": 0.8555609583854675, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.717369794845581, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 740.64453125, "completions/mean_terminated_length": 740.64453125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.3712, "grad_norm": 0.0003073941625189036, "learning_rate": 1.6200000000000002e-06, "loss": 0.0008, "num_tokens": 350358305.0, "reward": 1.3171844482421875, "reward_std": 0.16131284832954407, "rewards/accuracy_reward": 0.7760416865348816, "rewards/brier_reward": 0.8583121299743652, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7421548962593079, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3161.0, "completions/mean_length": 774.5768432617188, "completions/mean_terminated_length": 763.7295532226562, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3728, "grad_norm": 0.00028245485736988485, "learning_rate": 1.6000000000000001e-06, "loss": 0.0026, "num_tokens": 351961847.0, "reward": 1.261340856552124, "reward_std": 0.14909055829048157, "rewards/accuracy_reward": 0.6966145634651184, "rewards/brier_reward": 0.829308032989502, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.703554630279541, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 785.1569213867188, "completions/mean_terminated_length": 780.8402709960938, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.3744, "grad_norm": 0.0007982897805050015, "learning_rate": 1.5800000000000001e-06, "loss": 0.0034, "num_tokens": 353571368.0, "reward": 1.2633309364318848, "reward_std": 0.1445775032043457, "rewards/accuracy_reward": 0.7018229365348816, "rewards/brier_reward": 0.8261263370513916, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7377603650093079, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2117.0, "completions/max_terminated_length": 2117.0, "completions/mean_length": 752.1888427734375, "completions/mean_terminated_length": 752.1888427734375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.376, "grad_norm": 0.000666520616505295, "learning_rate": 1.56e-06, "loss": 0.0065, "num_tokens": 355145994.0, "reward": 1.2427505254745483, "reward_std": 0.14397239685058594, "rewards/accuracy_reward": 0.662109375, "rewards/brier_reward": 0.8233773112297058, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7113606929779053, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2027.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 759.0983276367188, "completions/mean_terminated_length": 759.0983276367188, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.3776, "grad_norm": 0.00033669330878183246, "learning_rate": 1.54e-06, "loss": -0.0006, "num_tokens": 356716961.0, "reward": 1.259657621383667, "reward_std": 0.13670209050178528, "rewards/accuracy_reward": 0.7102864384651184, "rewards/brier_reward": 0.8090136647224426, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7451171875, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 747.6491088867188, "completions/mean_terminated_length": 745.4677124023438, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.3792, "grad_norm": 0.0007230577175505459, "learning_rate": 1.52e-06, "loss": 0.0017, "num_tokens": 358277894.0, "reward": 1.242948055267334, "reward_std": 0.12534043192863464, "rewards/accuracy_reward": 0.6608073115348816, "rewards/brier_reward": 0.8257257342338562, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.702415406703949, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2596.0, "completions/mean_length": 777.6484375, "completions/mean_terminated_length": 771.1546020507812, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.3808, "grad_norm": 0.0003343730641063303, "learning_rate": 1.5e-06, "loss": 0.0022, "num_tokens": 359885226.0, "reward": 1.2361586093902588, "reward_std": 0.12917910516262054, "rewards/accuracy_reward": 0.642578125, "rewards/brier_reward": 0.8316782116889954, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6936458945274353, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 687.6478271484375, "completions/mean_terminated_length": 685.4273681640625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.3824, "grad_norm": 0.00031722395215183496, "learning_rate": 1.48e-06, "loss": 0.0041, "num_tokens": 361338349.0, "reward": 1.3068825006484985, "reward_std": 0.12090355157852173, "rewards/accuracy_reward": 0.755859375, "rewards/brier_reward": 0.8585416674613953, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.742382824420929, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3884.0, "completions/mean_length": 881.6614990234375, "completions/mean_terminated_length": 862.7164306640625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.384, "grad_norm": 0.0002889780735131353, "learning_rate": 1.46e-06, "loss": 0.0022, "num_tokens": 363123269.0, "reward": 1.2179514169692993, "reward_std": 0.16625238955020905, "rewards/accuracy_reward": 0.6373698115348816, "rewards/brier_reward": 0.8050293326377869, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.70305997133255, "step": 240 }, { "epoch": 0.384, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2273.5, "eval_completions/max_terminated_length": 2273.5, "eval_completions/mean_length": 770.6792373657227, "eval_completions/mean_terminated_length": 770.6792373657227, "eval_completions/min_length": 265.875, "eval_completions/min_terminated_length": 265.875, "eval_loss": 0.0, "eval_num_tokens": 363123269.0, "eval_reward": 1.2633255422115326, "eval_reward_std": 0.3119080364704132, "eval_rewards/accuracy_reward": 0.6923828125, "eval_rewards/brier_reward": 0.8342539891600609, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.7120410278439522, "eval_runtime": 154.5074, "eval_samples_per_second": 6.472, "eval_steps_per_second": 0.052, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3960.0, "completions/mean_length": 871.5416870117188, "completions/mean_terminated_length": 861.0110473632812, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.3856, "grad_norm": 0.0003040562442038208, "learning_rate": 1.44e-06, "loss": 0.0024, "num_tokens": 364886277.0, "reward": 1.1704692840576172, "reward_std": 0.13953690230846405, "rewards/accuracy_reward": 0.5670573115348816, "rewards/brier_reward": 0.7777741551399231, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.6642773151397705, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3975.0, "completions/mean_length": 712.62109375, "completions/mean_terminated_length": 710.4169311523438, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.3872, "grad_norm": 0.0003581687924452126, "learning_rate": 1.42e-06, "loss": 0.0047, "num_tokens": 366405023.0, "reward": 1.261784553527832, "reward_std": 0.14283758401870728, "rewards/accuracy_reward": 0.6842448115348816, "rewards/brier_reward": 0.8406119346618652, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7110025882720947, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 770.1282958984375, "completions/mean_terminated_length": 770.1282958984375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.3888, "grad_norm": 0.00025888290838338435, "learning_rate": 1.4000000000000001e-06, "loss": 0.0027, "num_tokens": 368003972.0, "reward": 1.2508610486984253, "reward_std": 0.12878620624542236, "rewards/accuracy_reward": 0.673828125, "rewards/brier_reward": 0.8278799653053284, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6979036331176758, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3233.0, "completions/mean_length": 794.7233276367188, "completions/mean_terminated_length": 790.4191284179688, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.3904, "grad_norm": 0.00026731810066848993, "learning_rate": 1.3800000000000001e-06, "loss": 0.0032, "num_tokens": 369649211.0, "reward": 1.2403613328933716, "reward_std": 0.12547096610069275, "rewards/accuracy_reward": 0.6438801884651184, "rewards/brier_reward": 0.8381322026252747, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6068164110183716, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 760.7982177734375, "completions/mean_terminated_length": 758.6253662109375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.392, "grad_norm": 0.00031815480906516314, "learning_rate": 1.3600000000000001e-06, "loss": 0.0032, "num_tokens": 371227461.0, "reward": 1.2480862140655518, "reward_std": 0.14381472766399384, "rewards/accuracy_reward": 0.6692708134651184, "rewards/brier_reward": 0.8275385499000549, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6939193606376648, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4002.0, "completions/mean_length": 700.7572021484375, "completions/mean_terminated_length": 694.1128540039062, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.3936, "grad_norm": 0.00025653981720097363, "learning_rate": 1.34e-06, "loss": 0.0052, "num_tokens": 372727632.0, "reward": 1.2953475713729858, "reward_std": 0.12381541728973389, "rewards/accuracy_reward": 0.7428385615348816, "rewards/brier_reward": 0.8504469394683838, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6876649856567383, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3340.0, "completions/mean_length": 779.5045776367188, "completions/mean_terminated_length": 777.3439331054688, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.3952, "grad_norm": 0.0002414540504105389, "learning_rate": 1.32e-06, "loss": 0.0022, "num_tokens": 374336375.0, "reward": 1.295057773590088, "reward_std": 0.12152569741010666, "rewards/accuracy_reward": 0.7473958134651184, "rewards/brier_reward": 0.8433578014373779, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6496419310569763, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2675.0, "completions/mean_length": 828.4577026367188, "completions/mean_terminated_length": 826.3289794921875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.3968, "grad_norm": 0.0002858421648852527, "learning_rate": 1.3e-06, "loss": 0.0015, "num_tokens": 376020150.0, "reward": 1.2264795303344727, "reward_std": 0.14337709546089172, "rewards/accuracy_reward": 0.6451823115348816, "rewards/brier_reward": 0.8084152340888977, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6250585913658142, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 756.05859375, "completions/mean_terminated_length": 756.05859375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.3984, "grad_norm": 0.0003396105021238327, "learning_rate": 1.28e-06, "loss": 0.0008, "num_tokens": 377580624.0, "reward": 1.2818405628204346, "reward_std": 0.13051815330982208, "rewards/accuracy_reward": 0.7337239384651184, "rewards/brier_reward": 0.8299440741539001, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.64676433801651, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2722.0, "completions/max_terminated_length": 2722.0, "completions/mean_length": 735.8112182617188, "completions/mean_terminated_length": 735.8112182617188, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4, "grad_norm": 0.00028913895948790014, "learning_rate": 1.26e-06, "loss": 0.0013, "num_tokens": 379128142.0, "reward": 1.2330697774887085, "reward_std": 0.1278759390115738, "rewards/accuracy_reward": 0.6692708134651184, "rewards/brier_reward": 0.798809289932251, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6231966018676758, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2649.0, "completions/max_terminated_length": 2649.0, "completions/mean_length": 703.1237182617188, "completions/mean_terminated_length": 703.1237182617188, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.4016, "grad_norm": 0.0002372190065216273, "learning_rate": 1.2400000000000002e-06, "loss": 0.0008, "num_tokens": 380622508.0, "reward": 1.2969684600830078, "reward_std": 0.0962226614356041, "rewards/accuracy_reward": 0.732421875, "rewards/brier_reward": 0.8615023493766785, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6363607048988342, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 708.8053588867188, "completions/mean_terminated_length": 706.5986938476562, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.4032, "grad_norm": 0.00027465366292744875, "learning_rate": 1.2200000000000002e-06, "loss": 0.0011, "num_tokens": 382128385.0, "reward": 1.2713468074798584, "reward_std": 0.12537533044815063, "rewards/accuracy_reward": 0.7109375, "rewards/brier_reward": 0.8323941826820374, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6359700560569763, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3347.0, "completions/max_terminated_length": 3347.0, "completions/mean_length": 753.865234375, "completions/mean_terminated_length": 753.865234375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.4048, "grad_norm": 0.00024785054847598076, "learning_rate": 1.2000000000000002e-06, "loss": 0.0037, "num_tokens": 383690898.0, "reward": 1.338404655456543, "reward_std": 0.1253538429737091, "rewards/accuracy_reward": 0.8372395634651184, "rewards/brier_reward": 0.8395573496818542, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6294922232627869, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3947.0, "completions/mean_length": 765.408203125, "completions/mean_terminated_length": 756.712158203125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.4064, "grad_norm": 0.00027560829767026007, "learning_rate": 1.1800000000000001e-06, "loss": 0.0052, "num_tokens": 385283365.0, "reward": 1.3062567710876465, "reward_std": 0.1441681683063507, "rewards/accuracy_reward": 0.7701823115348816, "rewards/brier_reward": 0.8449217677116394, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.6705207824707031, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 760.41015625, "completions/mean_terminated_length": 753.8826293945312, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.408, "grad_norm": 0.00024119486624840647, "learning_rate": 1.1600000000000001e-06, "loss": 0.004, "num_tokens": 386865019.0, "reward": 1.2422809600830078, "reward_std": 0.1103820651769638, "rewards/accuracy_reward": 0.6438801884651184, "rewards/brier_reward": 0.8426220417022705, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6386393308639526, "step": 255 }, { "epoch": 0.408, "eval_completions/clipped_ratio": 0.0009765625, "eval_completions/max_length": 2392.875, "eval_completions/max_terminated_length": 2110.5, "eval_completions/mean_length": 744.8275299072266, "eval_completions/mean_terminated_length": 741.5473785400391, "eval_completions/min_length": 257.75, "eval_completions/min_terminated_length": 257.75, "eval_loss": 0.0, "eval_num_tokens": 386865019.0, "eval_reward": 1.264432892203331, "eval_reward_std": 0.2888818830251694, "eval_rewards/accuracy_reward": 0.697265625, "eval_rewards/brier_reward": 0.8325634673237801, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9990234375, "eval_rewards/mean_confidence_reward": 0.6566406339406967, "eval_runtime": 159.3931, "eval_samples_per_second": 6.274, "eval_steps_per_second": 0.05, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2839.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 836.4108276367188, "completions/mean_terminated_length": 836.4108276367188, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.4096, "grad_norm": 0.0002666335494723171, "learning_rate": 1.14e-06, "loss": 0.0023, "num_tokens": 388557970.0, "reward": 1.2565433979034424, "reward_std": 0.13489803671836853, "rewards/accuracy_reward": 0.69921875, "rewards/brier_reward": 0.8138556480407715, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6110547184944153, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3972.0, "completions/mean_length": 773.7396240234375, "completions/mean_terminated_length": 769.4080810546875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.4112, "grad_norm": 0.00020281363686081022, "learning_rate": 1.12e-06, "loss": 0.0018, "num_tokens": 390170498.0, "reward": 1.2956758737564087, "reward_std": 0.1046941801905632, "rewards/accuracy_reward": 0.7291666865348816, "rewards/brier_reward": 0.8634737133979797, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.6618945002555847, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 761.8177490234375, "completions/mean_terminated_length": 761.8177490234375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.4128, "grad_norm": 0.00026654975954443216, "learning_rate": 1.1e-06, "loss": 0.0016, "num_tokens": 391763466.0, "reward": 1.2971115112304688, "reward_std": 0.10875920951366425, "rewards/accuracy_reward": 0.7356770634651184, "rewards/brier_reward": 0.8585322499275208, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6729037165641785, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2917.0, "completions/max_terminated_length": 2917.0, "completions/mean_length": 775.6771240234375, "completions/mean_terminated_length": 775.6771240234375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.4144, "grad_norm": 0.0002941343991551548, "learning_rate": 1.08e-06, "loss": 0.001, "num_tokens": 393364922.0, "reward": 1.2899754047393799, "reward_std": 0.13599133491516113, "rewards/accuracy_reward": 0.7298176884651184, "rewards/brier_reward": 0.8501192927360535, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.6869401335716248, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2669.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 702.1484375, "completions/mean_terminated_length": 702.1484375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.416, "grad_norm": 0.00027645519003272057, "learning_rate": 1.06e-06, "loss": 0.0031, "num_tokens": 394856446.0, "reward": 1.3200023174285889, "reward_std": 0.1067432165145874, "rewards/accuracy_reward": 0.7734375, "rewards/brier_reward": 0.8665527701377869, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7093749642372131, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3817.0, "completions/mean_length": 787.8971557617188, "completions/mean_terminated_length": 781.42333984375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.4176, "grad_norm": 0.0003562932542990893, "learning_rate": 1.04e-06, "loss": 0.006, "num_tokens": 396477760.0, "reward": 1.2434288263320923, "reward_std": 0.16530069708824158, "rewards/accuracy_reward": 0.69921875, "rewards/brier_reward": 0.7895780205726624, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6847135424613953, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3643.0, "completions/mean_length": 778.4889526367188, "completions/mean_terminated_length": 774.1636352539062, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4192, "grad_norm": 0.00029440957587212324, "learning_rate": 1.02e-06, "loss": 0.0022, "num_tokens": 398092143.0, "reward": 1.2690677642822266, "reward_std": 0.13717547059059143, "rewards/accuracy_reward": 0.703125, "rewards/brier_reward": 0.8369496464729309, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.6895464062690735, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 809.5104370117188, "completions/mean_terminated_length": 803.0789184570312, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.4208, "grad_norm": 0.000310330098727718, "learning_rate": 1.0000000000000002e-06, "loss": 0.0019, "num_tokens": 399745791.0, "reward": 1.2950420379638672, "reward_std": 0.14444516599178314, "rewards/accuracy_reward": 0.7513020634651184, "rewards/brier_reward": 0.8420231342315674, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.7073893547058105, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 726.4349365234375, "completions/mean_terminated_length": 713.220947265625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.4224, "grad_norm": 0.0003654451866168529, "learning_rate": 9.800000000000001e-07, "loss": 0.0032, "num_tokens": 401269595.0, "reward": 1.2749302387237549, "reward_std": 0.1556210219860077, "rewards/accuracy_reward": 0.734375, "rewards/brier_reward": 0.8193759918212891, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.7758333086967468, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 778.18359375, "completions/mean_terminated_length": 776.0221557617188, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.424, "grad_norm": 0.0002868030860554427, "learning_rate": 9.600000000000001e-07, "loss": 0.0049, "num_tokens": 402873269.0, "reward": 1.266740083694458, "reward_std": 0.11665759980678558, "rewards/accuracy_reward": 0.677734375, "rewards/brier_reward": 0.8563827872276306, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6947482228279114, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3752.0, "completions/mean_length": 749.3411865234375, "completions/mean_terminated_length": 742.7919311523438, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.4256, "grad_norm": 0.0003816651296801865, "learning_rate": 9.400000000000001e-07, "loss": 0.006, "num_tokens": 404429217.0, "reward": 1.2989994287490845, "reward_std": 0.17248794436454773, "rewards/accuracy_reward": 0.7526041865348816, "rewards/brier_reward": 0.8473322987556458, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.7638998031616211, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 719.0540771484375, "completions/mean_terminated_length": 714.6512451171875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.4272, "grad_norm": 0.00031786985346116126, "learning_rate": 9.200000000000001e-07, "loss": 0.0057, "num_tokens": 405954996.0, "reward": 1.3021490573883057, "reward_std": 0.12548550963401794, "rewards/accuracy_reward": 0.7591145634651184, "rewards/brier_reward": 0.8464704155921936, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7456250190734863, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 736.9088745117188, "completions/mean_terminated_length": 734.7205200195312, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.4288, "grad_norm": 0.00036653323331847787, "learning_rate": 9.000000000000001e-07, "loss": 0.0043, "num_tokens": 407492232.0, "reward": 1.2662080526351929, "reward_std": 0.14813528954982758, "rewards/accuracy_reward": 0.7063801884651184, "rewards/brier_reward": 0.8273225426673889, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7625325322151184, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 780.3724365234375, "completions/mean_terminated_length": 743.2652587890625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.4304, "grad_norm": 0.000349526439094916, "learning_rate": 8.8e-07, "loss": 0.0081, "num_tokens": 409107300.0, "reward": 1.2132786512374878, "reward_std": 0.14902278780937195, "rewards/accuracy_reward": 0.6438801884651184, "rewards/brier_reward": 0.7943815588951111, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.7127604484558105, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3211.0, "completions/mean_length": 740.8619995117188, "completions/mean_terminated_length": 738.6762084960938, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.432, "grad_norm": 0.0003539361641742289, "learning_rate": 8.6e-07, "loss": -0.0008, "num_tokens": 410670672.0, "reward": 1.249704360961914, "reward_std": 0.13634084165096283, "rewards/accuracy_reward": 0.6907551884651184, "rewards/brier_reward": 0.8092894554138184, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7443880438804626, "step": 270 }, { "epoch": 0.432, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1871.5, "eval_completions/max_terminated_length": 1871.5, "eval_completions/mean_length": 727.5479278564453, "eval_completions/mean_terminated_length": 727.5479278564453, "eval_completions/min_length": 240.375, "eval_completions/min_terminated_length": 240.375, "eval_loss": 0.0, "eval_num_tokens": 410670672.0, "eval_reward": 1.2578789591789246, "eval_reward_std": 0.33254586160182953, "eval_rewards/accuracy_reward": 0.6953125, "eval_rewards/brier_reward": 0.8204302713274956, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.7551953345537186, "eval_runtime": 127.7681, "eval_samples_per_second": 7.827, "eval_steps_per_second": 0.063, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2169.0, "completions/max_terminated_length": 2169.0, "completions/mean_length": 644.4342651367188, "completions/mean_terminated_length": 644.4342651367188, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.4336, "grad_norm": 0.0010783898178488016, "learning_rate": 8.400000000000001e-07, "loss": 0.0004, "num_tokens": 412062059.0, "reward": 1.289198875427246, "reward_std": 0.12434270977973938, "rewards/accuracy_reward": 0.734375, "rewards/brier_reward": 0.8440071940422058, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7678385376930237, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 800.10546875, "completions/mean_terminated_length": 797.9583129882812, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.4352, "grad_norm": 0.00032713430118747056, "learning_rate": 8.200000000000001e-07, "loss": 0.0022, "num_tokens": 413707021.0, "reward": 1.2570998668670654, "reward_std": 0.16381941735744476, "rewards/accuracy_reward": 0.6959635615348816, "rewards/brier_reward": 0.8188721537590027, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7517903447151184, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3427.0, "completions/max_terminated_length": 3427.0, "completions/mean_length": 727.3236083984375, "completions/mean_terminated_length": 727.3236083984375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4368, "grad_norm": 0.0004381242615636438, "learning_rate": 8.000000000000001e-07, "loss": 0.0034, "num_tokens": 415236350.0, "reward": 1.2997373342514038, "reward_std": 0.16025710105895996, "rewards/accuracy_reward": 0.7252604365348816, "rewards/brier_reward": 0.8741992115974426, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7495443224906921, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 692.6432495117188, "completions/mean_terminated_length": 692.6432495117188, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4384, "grad_norm": 0.00036950717912986875, "learning_rate": 7.8e-07, "loss": 0.002, "num_tokens": 416709914.0, "reward": 1.2723698616027832, "reward_std": 0.12231775373220444, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.8298807740211487, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7495377659797668, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 748.4603271484375, "completions/mean_terminated_length": 744.0958251953125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.44, "grad_norm": 0.0003206534020137042, "learning_rate": 7.6e-07, "loss": -0.0002, "num_tokens": 418268029.0, "reward": 1.2111705541610718, "reward_std": 0.13704757392406464, "rewards/accuracy_reward": 0.6510416865348816, "rewards/brier_reward": 0.7725862860679626, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7523762583732605, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3227.0, "completions/mean_length": 732.4876708984375, "completions/mean_terminated_length": 728.1023559570312, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.4416, "grad_norm": 0.0003790144110098481, "learning_rate": 7.4e-07, "loss": 0.0031, "num_tokens": 419819242.0, "reward": 1.2106105089187622, "reward_std": 0.15849772095680237, "rewards/accuracy_reward": 0.6171875, "rewards/brier_reward": 0.805320680141449, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.7409830689430237, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 790.78515625, "completions/mean_terminated_length": 788.6318969726562, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.4432, "grad_norm": 0.0004914980963803828, "learning_rate": 7.2e-07, "loss": 0.0023, "num_tokens": 421450560.0, "reward": 1.2584004402160645, "reward_std": 0.18821319937705994, "rewards/accuracy_reward": 0.7083333134651184, "rewards/brier_reward": 0.8091031908988953, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7650065422058105, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3179.0, "completions/max_terminated_length": 3179.0, "completions/mean_length": 695.8073120117188, "completions/mean_terminated_length": 695.8073120117188, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4448, "grad_norm": 0.0004518725036177784, "learning_rate": 7.000000000000001e-07, "loss": -0.0014, "num_tokens": 422932568.0, "reward": 1.261359453201294, "reward_std": 0.1607692986726761, "rewards/accuracy_reward": 0.697265625, "rewards/brier_reward": 0.8254378437995911, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7701497673988342, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 4021.0, "completions/mean_length": 737.0358276367188, "completions/mean_terminated_length": 734.8475341796875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.4464, "grad_norm": 0.000430735235568136, "learning_rate": 6.800000000000001e-07, "loss": 0.0043, "num_tokens": 424473423.0, "reward": 1.2775459289550781, "reward_std": 0.1455737203359604, "rewards/accuracy_reward": 0.7213541865348816, "rewards/brier_reward": 0.8350244164466858, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.760449230670929, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2218.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 739.4225463867188, "completions/mean_terminated_length": 739.4225463867188, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.448, "grad_norm": 0.00039900391129776835, "learning_rate": 6.6e-07, "loss": 0.0011, "num_tokens": 426027832.0, "reward": 1.2644761800765991, "reward_std": 0.15900836884975433, "rewards/accuracy_reward": 0.7037760615348816, "rewards/brier_reward": 0.8251611590385437, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.75048828125, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 717.345703125, "completions/mean_terminated_length": 717.345703125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.4496, "grad_norm": 0.000307876558508724, "learning_rate": 6.4e-07, "loss": -0.0006, "num_tokens": 427534091.0, "reward": 1.3375110626220703, "reward_std": 0.12728455662727356, "rewards/accuracy_reward": 0.7942708134651184, "rewards/brier_reward": 0.8807356953620911, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7784505486488342, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2484.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 708.109375, "completions/mean_terminated_length": 708.109375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.4512, "grad_norm": 0.0003453131066635251, "learning_rate": 6.200000000000001e-07, "loss": 0.0028, "num_tokens": 429033107.0, "reward": 1.3160195350646973, "reward_std": 0.12515944242477417, "rewards/accuracy_reward": 0.76953125, "rewards/brier_reward": 0.8624919056892395, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7952799797058105, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3908.0, "completions/mean_length": 751.6784057617188, "completions/mean_terminated_length": 749.4996337890625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.4528, "grad_norm": 0.0004134805640205741, "learning_rate": 6.000000000000001e-07, "loss": 0.0012, "num_tokens": 430593861.0, "reward": 1.266950249671936, "reward_std": 0.14746582508087158, "rewards/accuracy_reward": 0.6979166865348816, "rewards/brier_reward": 0.8366195559501648, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.757519543170929, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3333.0, "completions/mean_length": 747.7135620117188, "completions/mean_terminated_length": 736.778564453125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.4544, "grad_norm": 0.0003506985376589, "learning_rate": 5.800000000000001e-07, "loss": 0.0033, "num_tokens": 432154573.0, "reward": 1.2804300785064697, "reward_std": 0.13715115189552307, "rewards/accuracy_reward": 0.7259114384651184, "rewards/brier_reward": 0.8381884694099426, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.765332043170929, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2632.0, "completions/max_terminated_length": 2632.0, "completions/mean_length": 687.2825927734375, "completions/mean_terminated_length": 687.2825927734375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.456, "grad_norm": 0.00046250695595517755, "learning_rate": 5.6e-07, "loss": 0.0011, "num_tokens": 433617759.0, "reward": 1.2552844285964966, "reward_std": 0.14801360666751862, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.7957096099853516, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7628254890441895, "step": 285 }, { "epoch": 0.456, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 2101.0, "eval_completions/max_terminated_length": 2101.0, "eval_completions/mean_length": 728.6219253540039, "eval_completions/mean_terminated_length": 728.6219253540039, "eval_completions/min_length": 238.5, "eval_completions/min_terminated_length": 238.5, "eval_loss": 0.0, "eval_num_tokens": 433617759.0, "eval_reward": 1.270130768418312, "eval_reward_std": 0.3241976350545883, "eval_rewards/accuracy_reward": 0.705078125, "eval_rewards/brier_reward": 0.8351684510707855, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.7483887001872063, "eval_runtime": 142.119, "eval_samples_per_second": 7.036, "eval_steps_per_second": 0.056, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 688.84765625, "completions/mean_terminated_length": 688.84765625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.4576, "grad_norm": 0.0004300959990359843, "learning_rate": 5.4e-07, "loss": 0.004, "num_tokens": 435085333.0, "reward": 1.3292810916900635, "reward_std": 0.13442009687423706, "rewards/accuracy_reward": 0.7916666865348816, "rewards/brier_reward": 0.866879940032959, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7669596672058105, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 789.4154052734375, "completions/mean_terminated_length": 789.4154052734375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.4592, "grad_norm": 0.0005124983144924045, "learning_rate": 5.2e-07, "loss": 0.0015, "num_tokens": 436721267.0, "reward": 1.1880624294281006, "reward_std": 0.1657973676919937, "rewards/accuracy_reward": 0.6015625, "rewards/brier_reward": 0.7745475172996521, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7263021469116211, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3994.0, "completions/max_terminated_length": 3994.0, "completions/mean_length": 806.50390625, "completions/mean_terminated_length": 806.50390625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.4608, "grad_norm": 0.00037309739855118096, "learning_rate": 5.000000000000001e-07, "loss": 0.0051, "num_tokens": 438379289.0, "reward": 1.2087801694869995, "reward_std": 0.16232234239578247, "rewards/accuracy_reward": 0.62109375, "rewards/brier_reward": 0.7971029281616211, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7347005009651184, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3991.0, "completions/mean_length": 790.900390625, "completions/mean_terminated_length": 764.8759765625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.4624, "grad_norm": 0.00035559016396291554, "learning_rate": 4.800000000000001e-07, "loss": 0.0038, "num_tokens": 439999456.0, "reward": 1.2059985399246216, "reward_std": 0.14531877636909485, "rewards/accuracy_reward": 0.6451823115348816, "rewards/brier_reward": 0.77591472864151, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.728515625, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2003.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 671.8255615234375, "completions/mean_terminated_length": 671.8255615234375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.464, "grad_norm": 0.00042939442209899426, "learning_rate": 4.6000000000000004e-07, "loss": 0.0015, "num_tokens": 441431572.0, "reward": 1.307799220085144, "reward_std": 0.0975048765540123, "rewards/accuracy_reward": 0.7493489384651184, "rewards/brier_reward": 0.8662341237068176, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7652214169502258, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2174.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 736.3795776367188, "completions/mean_terminated_length": 736.3795776367188, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4656, "grad_norm": 0.00034132087603211403, "learning_rate": 4.4e-07, "loss": 0.0007, "num_tokens": 442963451.0, "reward": 1.2967678308486938, "reward_std": 0.1268530935049057, "rewards/accuracy_reward": 0.7486979365348816, "rewards/brier_reward": 0.844822108745575, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7768815159797668, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 767.67578125, "completions/mean_terminated_length": 758.9856567382812, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.4672, "grad_norm": 0.0003433678357396275, "learning_rate": 4.2000000000000006e-07, "loss": 0.0052, "num_tokens": 444558025.0, "reward": 1.2763804197311401, "reward_std": 0.13755780458450317, "rewards/accuracy_reward": 0.703125, "rewards/brier_reward": 0.8528766632080078, "rewards/confidence_one_or_zero": 0.0006510416860692203, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.7110547423362732, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0013020833333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 738.2916870117188, "completions/mean_terminated_length": 733.9139404296875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.4688, "grad_norm": 0.00045168292126618326, "learning_rate": 4.0000000000000003e-07, "loss": 0.0043, "num_tokens": 446111657.0, "reward": 1.2984068393707275, "reward_std": 0.14239460229873657, "rewards/accuracy_reward": 0.7604166865348816, "rewards/brier_reward": 0.837683916091919, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9986979365348816, "rewards/mean_confidence_reward": 0.744824230670929, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2268.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 643.2877807617188, "completions/mean_terminated_length": 643.2877807617188, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.4704, "grad_norm": 0.00040195477777160704, "learning_rate": 3.8e-07, "loss": 0.0004, "num_tokens": 447493219.0, "reward": 1.2462185621261597, "reward_std": 0.09801927208900452, "rewards/accuracy_reward": 0.67578125, "rewards/brier_reward": 0.8166406750679016, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7602865099906921, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2217.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 742.3841552734375, "completions/mean_terminated_length": 742.3841552734375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.472, "grad_norm": 0.0003178889455739409, "learning_rate": 3.6e-07, "loss": 0.002, "num_tokens": 449038065.0, "reward": 1.3069126605987549, "reward_std": 0.12454438209533691, "rewards/accuracy_reward": 0.744140625, "rewards/brier_reward": 0.8696691393852234, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7631662487983704, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 709.8346557617188, "completions/mean_terminated_length": 709.8346557617188, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4736, "grad_norm": 0.0003638795460574329, "learning_rate": 3.4000000000000003e-07, "loss": 0.0001, "num_tokens": 450550259.0, "reward": 1.2740726470947266, "reward_std": 0.10870181024074554, "rewards/accuracy_reward": 0.7298176884651184, "rewards/brier_reward": 0.8183121681213379, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.75921231508255, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3491.0, "completions/mean_length": 776.4498901367188, "completions/mean_terminated_length": 767.7826538085938, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4752, "grad_norm": 0.0005423931288532913, "learning_rate": 3.2e-07, "loss": 0.0026, "num_tokens": 452163110.0, "reward": 1.2303560972213745, "reward_std": 0.1594705730676651, "rewards/accuracy_reward": 0.6360676884651184, "rewards/brier_reward": 0.8272339701652527, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7227473855018616, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 703.12890625, "completions/mean_terminated_length": 700.9185791015625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.4768, "grad_norm": 0.0004639087710529566, "learning_rate": 3.0000000000000004e-07, "loss": 0.0042, "num_tokens": 453645292.0, "reward": 1.256563663482666, "reward_std": 0.1316872537136078, "rewards/accuracy_reward": 0.6803385615348816, "rewards/brier_reward": 0.8334245085716248, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7559245228767395, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3181.0, "completions/mean_length": 789.041015625, "completions/mean_terminated_length": 780.4066772460938, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.4784, "grad_norm": 0.00033448453177697957, "learning_rate": 2.8e-07, "loss": 0.004, "num_tokens": 455260683.0, "reward": 1.3047873973846436, "reward_std": 0.14105179905891418, "rewards/accuracy_reward": 0.7532551884651184, "rewards/brier_reward": 0.8589094281196594, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.7109375, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 717.736328125, "completions/mean_terminated_length": 715.5355224609375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.48, "grad_norm": 0.0007051600841805339, "learning_rate": 2.6e-07, "loss": 0.0038, "num_tokens": 456772566.0, "reward": 1.2453429698944092, "reward_std": 0.1548844873905182, "rewards/accuracy_reward": 0.671875, "rewards/brier_reward": 0.8194465637207031, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7630208134651184, "step": 300 }, { "epoch": 0.48, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 1676.0, "eval_completions/max_terminated_length": 1676.0, "eval_completions/mean_length": 726.3635101318359, "eval_completions/mean_terminated_length": 726.3635101318359, "eval_completions/min_length": 243.875, "eval_completions/min_terminated_length": 243.875, "eval_loss": 0.0, "eval_num_tokens": 456772566.0, "eval_reward": 1.256611406803131, "eval_reward_std": 0.32671603187918663, "eval_rewards/accuracy_reward": 0.6884765625, "eval_rewards/brier_reward": 0.8247314542531967, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.7374023571610451, "eval_runtime": 116.1412, "eval_samples_per_second": 8.61, "eval_steps_per_second": 0.069, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3328.0, "completions/max_terminated_length": 3328.0, "completions/mean_length": 757.3112182617188, "completions/mean_terminated_length": 757.3112182617188, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.4816, "grad_norm": 0.0003024719189852476, "learning_rate": 2.4000000000000003e-07, "loss": 0.0012, "num_tokens": 458335668.0, "reward": 1.2854983806610107, "reward_std": 0.12121113389730453, "rewards/accuracy_reward": 0.7350260615348816, "rewards/brier_reward": 0.835955798625946, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.74149090051651, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 682.267578125, "completions/mean_terminated_length": 682.267578125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.4832, "grad_norm": 0.0003994290600530803, "learning_rate": 2.2e-07, "loss": 0.0031, "num_tokens": 459787119.0, "reward": 1.3686707019805908, "reward_std": 0.12233855575323105, "rewards/accuracy_reward": 0.8489583134651184, "rewards/brier_reward": 0.8883671760559082, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7796158790588379, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 738.7057495117188, "completions/mean_terminated_length": 738.7057495117188, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.4848, "grad_norm": 0.0003309908788651228, "learning_rate": 2.0000000000000002e-07, "loss": 0.0018, "num_tokens": 461338763.0, "reward": 1.308286190032959, "reward_std": 0.1485404372215271, "rewards/accuracy_reward": 0.7506510615348816, "rewards/brier_reward": 0.8659065365791321, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.72119140625, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2432.0, "completions/max_terminated_length": 2432.0, "completions/mean_length": 761.4088745117188, "completions/mean_terminated_length": 761.4088745117188, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.4864, "grad_norm": 0.00035207331529818475, "learning_rate": 1.8e-07, "loss": 0.0031, "num_tokens": 462922143.0, "reward": 1.3161344528198242, "reward_std": 0.12203608453273773, "rewards/accuracy_reward": 0.771484375, "rewards/brier_reward": 0.8607698082923889, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7275716662406921, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0026041666666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 693.7181396484375, "completions/mean_terminated_length": 684.8348388671875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.488, "grad_norm": 0.0004044874513056129, "learning_rate": 1.6e-07, "loss": 0.0032, "num_tokens": 464399310.0, "reward": 1.3552474975585938, "reward_std": 0.11239050328731537, "rewards/accuracy_reward": 0.8177083134651184, "rewards/brier_reward": 0.8953760266304016, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.75244140625, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 746.619140625, "completions/mean_terminated_length": 746.619140625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4896, "grad_norm": 0.00040862540481612086, "learning_rate": 1.4e-07, "loss": 0.0017, "num_tokens": 465959909.0, "reward": 1.2425869703292847, "reward_std": 0.14031323790550232, "rewards/accuracy_reward": 0.6647135615348816, "rewards/brier_reward": 0.8204459547996521, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7080078125, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 795.4928588867188, "completions/mean_terminated_length": 793.3426513671875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.4912, "grad_norm": 0.0002932342467829585, "learning_rate": 1.2000000000000002e-07, "loss": 0.0042, "num_tokens": 467581562.0, "reward": 1.28499174118042, "reward_std": 0.11033116281032562, "rewards/accuracy_reward": 0.712890625, "rewards/brier_reward": 0.857729434967041, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.711621105670929, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 676.2220458984375, "completions/mean_terminated_length": 676.2220458984375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.4928, "grad_norm": 0.00029682141030207276, "learning_rate": 1.0000000000000001e-07, "loss": 0.0035, "num_tokens": 469033743.0, "reward": 1.3275282382965088, "reward_std": 0.10203856229782104, "rewards/accuracy_reward": 0.79296875, "rewards/brier_reward": 0.8620722889900208, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7659050822257996, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 668.6361083984375, "completions/mean_terminated_length": 666.4032592773438, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.4944, "grad_norm": 0.000334031181409955, "learning_rate": 8e-08, "loss": 0.002, "num_tokens": 470472768.0, "reward": 1.230898380279541, "reward_std": 0.1208835244178772, "rewards/accuracy_reward": 0.64453125, "rewards/brier_reward": 0.8179028034210205, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.6882227063179016, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 756.2454833984375, "completions/mean_terminated_length": 754.0697021484375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.496, "grad_norm": 0.0003742491244338453, "learning_rate": 6.000000000000001e-08, "loss": 0.0034, "num_tokens": 472047289.0, "reward": 1.2818677425384521, "reward_std": 0.1485479772090912, "rewards/accuracy_reward": 0.7233073115348816, "rewards/brier_reward": 0.841064453125, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7341146469116211, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3685.0, "completions/mean_length": 824.46484375, "completions/mean_terminated_length": 822.3335571289062, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.4976, "grad_norm": 0.0004076404729858041, "learning_rate": 4e-08, "loss": -0.0001, "num_tokens": 473725443.0, "reward": 1.2516274452209473, "reward_std": 0.1677635759115219, "rewards/accuracy_reward": 0.6881510615348816, "rewards/brier_reward": 0.8157405853271484, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7047200202941895, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0006510416666666297, "completions/max_length": 4096.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 725.3548583984375, "completions/mean_terminated_length": 723.158935546875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.4992, "grad_norm": 0.00032240949803963304, "learning_rate": 2e-08, "loss": 0.0016, "num_tokens": 475261988.0, "reward": 1.279578685760498, "reward_std": 0.14045238494873047, "rewards/accuracy_reward": 0.720703125, "rewards/brier_reward": 0.8390908241271973, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9993489384651184, "rewards/mean_confidence_reward": 0.7191992402076721, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3622.0, "completions/max_terminated_length": 3622.0, "completions/mean_length": 768.0338745117188, "completions/mean_terminated_length": 768.0338745117188, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5008, "grad_norm": 0.0002836451749317348, "learning_rate": 0.0, "loss": 0.003, "num_tokens": 476855928.0, "reward": 1.298927664756775, "reward_std": 0.10419078916311264, "rewards/accuracy_reward": 0.7272135615348816, "rewards/brier_reward": 0.8706266283988953, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.7468425631523132, "step": 313 }, { "epoch": 0.5008, "step": 313, "total_flos": 0.0, "train_loss": 0.005839667457993479, "train_runtime": 60601.0049, "train_samples_per_second": 0.248, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 313, "num_input_tokens_seen": 476855928, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }