{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4992, "eval_steps": 15, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3781.0, "completions/mean_length": 555.3671875, "completions/mean_terminated_length": 484.836669921875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.0064, "grad_norm": 0.003704642876982689, "learning_rate": 3.125e-07, "loss": 0.0241, "num_tokens": 1107052.0, "reward": 0.3880208432674408, "reward_std": 0.3527620732784271, "rewards/accuracy_reward": 0.236328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5397135615348816, "rewards/mean_confidence_reward": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01888020833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 593.521484375, "completions/mean_terminated_length": 526.1214599609375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0128, "grad_norm": 0.002923233201727271, "learning_rate": 6.25e-07, "loss": 0.0154, "num_tokens": 2274701.0, "reward": 0.3753255307674408, "reward_std": 0.3523617386817932, "rewards/accuracy_reward": 0.2428385466337204, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5078125, "rewards/mean_confidence_reward": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02278645833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 621.548828125, "completions/mean_terminated_length": 540.5323486328125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0192, "grad_norm": 0.002838415326550603, "learning_rate": 9.375000000000001e-07, "loss": 0.0186, "num_tokens": 3489440.0, "reward": 0.4033203125, "reward_std": 0.3530685305595398, "rewards/accuracy_reward": 0.2526041567325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5540364384651184, "rewards/mean_confidence_reward": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02213541666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 532.0306396484375, "completions/mean_terminated_length": 451.3548278808594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0256, "grad_norm": 0.003787894267588854, "learning_rate": 1.25e-06, "loss": 0.0197, "num_tokens": 4548567.0, "reward": 0.4485677182674408, "reward_std": 0.35836929082870483, "rewards/accuracy_reward": 0.2838541567325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.61328125, "rewards/mean_confidence_reward": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02278645833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 584.3060302734375, "completions/mean_terminated_length": 502.4210510253906, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.032, "grad_norm": 0.005179207772016525, "learning_rate": 1.5625e-06, "loss": 0.0199, "num_tokens": 5699869.0, "reward": 0.4384765625, "reward_std": 0.34911227226257324, "rewards/accuracy_reward": 0.2630208432674408, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.6139323115348816, "rewards/mean_confidence_reward": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02669270833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3892.0, "completions/mean_length": 575.130859375, "completions/mean_terminated_length": 478.5718994140625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0384, "grad_norm": 0.011147035285830498, "learning_rate": 1.8750000000000003e-06, "loss": 0.0203, "num_tokens": 6838910.0, "reward": 0.4508463740348816, "reward_std": 0.3353247046470642, "rewards/accuracy_reward": 0.25390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.6477864384651184, "rewards/mean_confidence_reward": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 502.00390625, "completions/mean_terminated_length": 435.2718811035156, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0448, "grad_norm": 0.01325953472405672, "learning_rate": 2.1875000000000002e-06, "loss": 0.023, "num_tokens": 7867508.0, "reward": 0.564453125, "reward_std": 0.2995663285255432, "rewards/accuracy_reward": 0.328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.80078125, "rewards/mean_confidence_reward": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01822916666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4003.0, "completions/mean_length": 463.3372497558594, "completions/mean_terminated_length": 395.88726806640625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0512, "grad_norm": 0.0010683703003451228, "learning_rate": 2.5e-06, "loss": 0.0198, "num_tokens": 8829290.0, "reward": 0.6012369990348816, "reward_std": 0.26198610663414, "rewards/accuracy_reward": 0.3209635317325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.8815104365348816, "rewards/mean_confidence_reward": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3628.0, "completions/mean_length": 421.8196716308594, "completions/mean_terminated_length": 380.6997985839844, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0576, "grad_norm": 0.0007295631221495569, "learning_rate": 2.8125e-06, "loss": 0.0142, "num_tokens": 9735477.0, "reward": 0.6845703125, "reward_std": 0.24511408805847168, "rewards/accuracy_reward": 0.4322916567325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9368489384651184, "rewards/mean_confidence_reward": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3913.0, "completions/mean_length": 404.9270935058594, "completions/mean_terminated_length": 363.6181640625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.064, "grad_norm": 0.0006488566286861897, "learning_rate": 3.125e-06, "loss": 0.003, "num_tokens": 10619397.0, "reward": 0.6578776240348816, "reward_std": 0.2086045742034912, "rewards/accuracy_reward": 0.3502604067325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9654948115348816, "rewards/mean_confidence_reward": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01236979166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3736.0, "completions/mean_length": 388.5774841308594, "completions/mean_terminated_length": 342.1430358886719, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0704, "grad_norm": 0.0006107186200097203, "learning_rate": 3.4375e-06, "loss": 0.0069, "num_tokens": 11470260.0, "reward": 0.6689453125, "reward_std": 0.19426743686199188, "rewards/accuracy_reward": 0.36328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.974609375, "rewards/mean_confidence_reward": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3337.0, "completions/mean_length": 397.7044372558594, "completions/mean_terminated_length": 358.7750244140625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0768, "grad_norm": 0.0006395940436050296, "learning_rate": 3.7500000000000005e-06, "loss": 0.0029, "num_tokens": 12329606.0, "reward": 0.6666666865348816, "reward_std": 0.19114232063293457, "rewards/accuracy_reward": 0.3515625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9817708134651184, "rewards/mean_confidence_reward": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3702.0, "completions/mean_length": 359.02734375, "completions/mean_terminated_length": 341.91888427734375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.0832, "grad_norm": 0.000690784421749413, "learning_rate": 4.0625000000000005e-06, "loss": 0.0011, "num_tokens": 13131600.0, "reward": 0.7041015625, "reward_std": 0.18511804938316345, "rewards/accuracy_reward": 0.4186197817325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 398.13671875, "completions/mean_terminated_length": 381.2073059082031, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0896, "grad_norm": 0.0004906603717245162, "learning_rate": 4.3750000000000005e-06, "loss": -0.001, "num_tokens": 13995730.0, "reward": 0.7259114980697632, "reward_std": 0.18286210298538208, "rewards/accuracy_reward": 0.4576822817325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 436.1712341308594, "completions/mean_terminated_length": 404.9317321777344, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.096, "grad_norm": 0.00048343793605454266, "learning_rate": 4.6875000000000004e-06, "loss": 0.0086, "num_tokens": 14925649.0, "reward": 0.7399088740348816, "reward_std": 0.1765557825565338, "rewards/accuracy_reward": 0.4928385317325592, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.0, "step": 15 }, { "epoch": 0.096, "eval_completions/clipped_ratio": 0.0068359375, "eval_completions/max_length": 3393.75, "eval_completions/max_terminated_length": 1931.25, "eval_completions/mean_length": 430.47265625, "eval_completions/mean_terminated_length": 405.194766998291, "eval_completions/min_length": 71.0, "eval_completions/min_terminated_length": 71.0, "eval_loss": 0.0, "eval_num_tokens": 14925649.0, "eval_reward": 0.73095703125, "eval_reward_std": 0.2545679435133934, "eval_rewards/accuracy_reward": 0.4697265625, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9921875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 219.6084, "eval_samples_per_second": 4.554, "eval_steps_per_second": 0.036, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 426.626953125, "completions/mean_terminated_length": 405.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1024, "grad_norm": 0.003415175247937441, "learning_rate": 5e-06, "loss": 0.0059, "num_tokens": 15840852.0, "reward": 0.7454427480697632, "reward_std": 0.18521976470947266, "rewards/accuracy_reward": 0.5013020634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 487.5833435058594, "completions/mean_terminated_length": 437.5657043457031, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.1088, "grad_norm": 0.014726120978593826, "learning_rate": 4.919354838709678e-06, "loss": 0.0066, "num_tokens": 16847004.0, "reward": 0.7158203125, "reward_std": 0.17167815566062927, "rewards/accuracy_reward": 0.44921875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.982421875, "rewards/mean_confidence_reward": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3088.0, "completions/mean_length": 502.166015625, "completions/mean_terminated_length": 476.2432861328125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.1152, "grad_norm": 0.0003558692696969956, "learning_rate": 4.838709677419355e-06, "loss": 0.0032, "num_tokens": 17866827.0, "reward": 0.7649739980697632, "reward_std": 0.1702839434146881, "rewards/accuracy_reward": 0.5390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9908854365348816, "rewards/mean_confidence_reward": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01236979166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 523.8353271484375, "completions/mean_terminated_length": 479.09490966796875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.1216, "grad_norm": 0.000376895593944937, "learning_rate": 4.758064516129033e-06, "loss": 0.0101, "num_tokens": 18925582.0, "reward": 0.755859375, "reward_std": 0.14837318658828735, "rewards/accuracy_reward": 0.5279948115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3952.0, "completions/mean_length": 555.5970458984375, "completions/mean_terminated_length": 513.615966796875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.128, "grad_norm": 0.00034090346889570355, "learning_rate": 4.67741935483871e-06, "loss": 0.0081, "num_tokens": 20030195.0, "reward": 0.7639974355697632, "reward_std": 0.1521807610988617, "rewards/accuracy_reward": 0.5423176884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4010.0, "completions/mean_length": 614.029296875, "completions/mean_terminated_length": 568.093017578125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.1344, "grad_norm": 0.00031957231112755835, "learning_rate": 4.596774193548387e-06, "loss": 0.0125, "num_tokens": 21227192.0, "reward": 0.7877604365348816, "reward_std": 0.15472644567489624, "rewards/accuracy_reward": 0.58984375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9856770634651184, "rewards/mean_confidence_reward": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3994.0, "completions/mean_length": 574.828125, "completions/mean_terminated_length": 540.1026000976562, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.1408, "grad_norm": 0.0002684423525352031, "learning_rate": 4.516129032258065e-06, "loss": 0.0087, "num_tokens": 22359008.0, "reward": 0.7802734375, "reward_std": 0.13622406125068665, "rewards/accuracy_reward": 0.57421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01888020833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 671.7708740234375, "completions/mean_terminated_length": 605.8765869140625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.1472, "grad_norm": 0.00025072344578802586, "learning_rate": 4.435483870967742e-06, "loss": 0.0164, "num_tokens": 23650904.0, "reward": 0.7786458730697632, "reward_std": 0.16046416759490967, "rewards/accuracy_reward": 0.5807291865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9765625, "rewards/mean_confidence_reward": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3210.0, "completions/mean_length": 593.3073120117188, "completions/mean_terminated_length": 570.3538818359375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.1536, "grad_norm": 0.0002969923953060061, "learning_rate": 4.35483870967742e-06, "loss": 0.0055, "num_tokens": 24813128.0, "reward": 0.8098958730697632, "reward_std": 0.12023092806339264, "rewards/accuracy_reward": 0.6263020634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02083333333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 643.1803588867188, "completions/mean_terminated_length": 569.716064453125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.16, "grad_norm": 0.0002521358255762607, "learning_rate": 4.274193548387097e-06, "loss": 0.0127, "num_tokens": 26048253.0, "reward": 0.7884114980697632, "reward_std": 0.14166441559791565, "rewards/accuracy_reward": 0.6002604365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9765625, "rewards/mean_confidence_reward": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 626.1217651367188, "completions/mean_terminated_length": 564.0364990234375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1664, "grad_norm": 0.0002549105847720057, "learning_rate": 4.193548387096774e-06, "loss": 0.0126, "num_tokens": 27265200.0, "reward": 0.77734375, "reward_std": 0.13510474562644958, "rewards/accuracy_reward": 0.5735676884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9811198115348816, "rewards/mean_confidence_reward": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4089.0, "completions/mean_length": 604.61328125, "completions/mean_terminated_length": 542.1431884765625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.1728, "grad_norm": 0.0002475589863024652, "learning_rate": 4.112903225806452e-06, "loss": 0.0144, "num_tokens": 28439614.0, "reward": 0.8125, "reward_std": 0.13454118371009827, "rewards/accuracy_reward": 0.64453125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98046875, "rewards/mean_confidence_reward": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4020.0, "completions/mean_length": 641.990234375, "completions/mean_terminated_length": 603.3344116210938, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1792, "grad_norm": 0.0002359977224841714, "learning_rate": 4.032258064516129e-06, "loss": 0.0105, "num_tokens": 29677791.0, "reward": 0.8151041865348816, "reward_std": 0.13174818456172943, "rewards/accuracy_reward": 0.6438801884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01888020833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3084.0, "completions/mean_length": 661.5521240234375, "completions/mean_terminated_length": 595.461181640625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.1856, "grad_norm": 0.0003096568398177624, "learning_rate": 3.951612903225807e-06, "loss": 0.0154, "num_tokens": 30938983.0, "reward": 0.7978515625, "reward_std": 0.12398765981197357, "rewards/accuracy_reward": 0.615234375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98046875, "rewards/mean_confidence_reward": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3647.0, "completions/mean_length": 658.087890625, "completions/mean_terminated_length": 608.1314697265625, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.192, "grad_norm": 0.0002497898240108043, "learning_rate": 3.870967741935484e-06, "loss": 0.0098, "num_tokens": 32198790.0, "reward": 0.787109375, "reward_std": 0.12306138128042221, "rewards/accuracy_reward": 0.5904948115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.0, "step": 30 }, { "epoch": 0.192, "eval_completions/clipped_ratio": 0.015850360576923073, "eval_completions/max_length": 4096.0, "eval_completions/max_terminated_length": 2561.25, "eval_completions/mean_length": 643.6061477661133, "eval_completions/mean_terminated_length": 587.8973083496094, "eval_completions/min_length": 166.5, "eval_completions/min_terminated_length": 166.5, "eval_loss": 0.0, "eval_num_tokens": 32198790.0, "eval_reward": 0.80517578125, "eval_reward_std": 0.25961557030677795, "eval_rewards/accuracy_reward": 0.6298828125, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.98046875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 272.2745, "eval_samples_per_second": 3.673, "eval_steps_per_second": 0.029, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01302083333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3641.0, "completions/mean_length": 638.5384521484375, "completions/mean_terminated_length": 592.9254760742188, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.1984, "grad_norm": 0.0002642054751049727, "learning_rate": 3.7903225806451614e-06, "loss": 0.0083, "num_tokens": 33435001.0, "reward": 0.8219401240348816, "reward_std": 0.12539035081863403, "rewards/accuracy_reward": 0.66015625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9837239384651184, "rewards/mean_confidence_reward": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01432291666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3878.0, "completions/mean_length": 673.169921875, "completions/mean_terminated_length": 623.4326782226562, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.2048, "grad_norm": 0.00023612201039213687, "learning_rate": 3.7096774193548392e-06, "loss": 0.0122, "num_tokens": 34716198.0, "reward": 0.8108724355697632, "reward_std": 0.135454922914505, "rewards/accuracy_reward": 0.6373698115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.984375, "rewards/mean_confidence_reward": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01236979166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 600.5631713867188, "completions/mean_terminated_length": 556.7837524414062, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.2112, "grad_norm": 0.0002798466884996742, "learning_rate": 3.6290322580645166e-06, "loss": 0.0093, "num_tokens": 35888359.0, "reward": 0.8180338740348816, "reward_std": 0.1216612458229065, "rewards/accuracy_reward": 0.6490885615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9869791865348816, "rewards/mean_confidence_reward": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3657.0, "completions/mean_length": 640.6198120117188, "completions/mean_terminated_length": 599.6469116210938, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.2176, "grad_norm": 0.00022835738491266966, "learning_rate": 3.548387096774194e-06, "loss": 0.0067, "num_tokens": 37122367.0, "reward": 0.8043619990348816, "reward_std": 0.11845268309116364, "rewards/accuracy_reward": 0.6204426884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 613.96484375, "completions/mean_terminated_length": 577.3118896484375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.224, "grad_norm": 0.00022982119116932154, "learning_rate": 3.4677419354838714e-06, "loss": 0.0074, "num_tokens": 38318185.0, "reward": 0.8128255605697632, "reward_std": 0.11442983150482178, "rewards/accuracy_reward": 0.63671875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 633.0618896484375, "completions/mean_terminated_length": 612.651611328125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2304, "grad_norm": 0.0002401992242084816, "learning_rate": 3.3870967741935484e-06, "loss": 0.004, "num_tokens": 39545168.0, "reward": 0.8138021230697632, "reward_std": 0.11124514043331146, "rewards/accuracy_reward": 0.634765625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01106770833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3188.0, "completions/mean_length": 644.42578125, "completions/mean_terminated_length": 605.7972412109375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.2368, "grad_norm": 0.0002481676929164678, "learning_rate": 3.306451612903226e-06, "loss": 0.008, "num_tokens": 40785846.0, "reward": 0.8229166865348816, "reward_std": 0.11275224387645721, "rewards/accuracy_reward": 0.6575520634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01692708333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3881.0, "completions/mean_length": 656.0540771484375, "completions/mean_terminated_length": 596.8231811523438, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2432, "grad_norm": 0.00023255293490365148, "learning_rate": 3.225806451612903e-06, "loss": 0.0092, "num_tokens": 42046369.0, "reward": 0.81640625, "reward_std": 0.11946391314268112, "rewards/accuracy_reward": 0.650390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.982421875, "rewards/mean_confidence_reward": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01627604166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3904.0, "completions/mean_length": 680.505859375, "completions/mean_terminated_length": 623.995361328125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2496, "grad_norm": 0.0002798327477648854, "learning_rate": 3.145161290322581e-06, "loss": 0.0105, "num_tokens": 43341162.0, "reward": 0.8089193105697632, "reward_std": 0.13103950023651123, "rewards/accuracy_reward": 0.63671875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9811198115348816, "rewards/mean_confidence_reward": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3231.0, "completions/mean_length": 649.806640625, "completions/mean_terminated_length": 618.1070556640625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.256, "grad_norm": 0.00021782911790069193, "learning_rate": 3.0645161290322584e-06, "loss": 0.0071, "num_tokens": 44593809.0, "reward": 0.8167318105697632, "reward_std": 0.10512678325176239, "rewards/accuracy_reward": 0.6438801884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9895833134651184, "rewards/mean_confidence_reward": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3253.0, "completions/mean_length": 646.7916870117188, "completions/mean_terminated_length": 617.3499755859375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.2624, "grad_norm": 0.00023386659449897707, "learning_rate": 2.983870967741936e-06, "loss": 0.0068, "num_tokens": 45839065.0, "reward": 0.833984375, "reward_std": 0.11914779245853424, "rewards/accuracy_reward": 0.6764323115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 654.45703125, "completions/mean_terminated_length": 592.8787231445312, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.2688, "grad_norm": 0.0002567242190707475, "learning_rate": 2.903225806451613e-06, "loss": 0.0088, "num_tokens": 47095759.0, "reward": 0.8098958730697632, "reward_std": 0.11656266450881958, "rewards/accuracy_reward": 0.6380208134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9817708134651184, "rewards/mean_confidence_reward": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3825.0, "completions/mean_length": 653.9583740234375, "completions/mean_terminated_length": 631.4024047851562, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2752, "grad_norm": 0.00028055167058482766, "learning_rate": 2.822580645161291e-06, "loss": 0.0062, "num_tokens": 48353599.0, "reward": 0.8131510615348816, "reward_std": 0.12230964004993439, "rewards/accuracy_reward": 0.6334635615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 624.6744995117188, "completions/mean_terminated_length": 597.3411865234375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2816, "grad_norm": 0.00026211561635136604, "learning_rate": 2.7419354838709676e-06, "loss": 0.0065, "num_tokens": 49565811.0, "reward": 0.8356119990348816, "reward_std": 0.11118366569280624, "rewards/accuracy_reward": 0.6796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3697.0, "completions/mean_length": 641.7838745117188, "completions/mean_terminated_length": 619.1481323242188, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.288, "grad_norm": 0.0002798437199089676, "learning_rate": 2.6612903225806454e-06, "loss": 0.0046, "num_tokens": 50801927.0, "reward": 0.810546875, "reward_std": 0.10706674307584763, "rewards/accuracy_reward": 0.6282551884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 45 }, { "epoch": 0.288, "eval_completions/clipped_ratio": 0.010967548076923073, "eval_completions/max_length": 3628.5, "eval_completions/max_terminated_length": 2420.75, "eval_completions/mean_length": 656.3849182128906, "eval_completions/mean_terminated_length": 618.1777877807617, "eval_completions/min_length": 171.125, "eval_completions/min_terminated_length": 171.125, "eval_loss": 0.0, "eval_num_tokens": 50801927.0, "eval_reward": 0.81494140625, "eval_reward_std": 0.2401072047650814, "eval_rewards/accuracy_reward": 0.6416015625, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.98828125, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 239.4014, "eval_samples_per_second": 4.177, "eval_steps_per_second": 0.033, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3457.0, "completions/mean_length": 617.0084838867188, "completions/mean_terminated_length": 582.6989135742188, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.2944, "grad_norm": 0.0002833885373547673, "learning_rate": 2.580645161290323e-06, "loss": 0.0067, "num_tokens": 52007356.0, "reward": 0.826171875, "reward_std": 0.11079498380422592, "rewards/accuracy_reward": 0.662109375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3737.0, "completions/mean_length": 685.18359375, "completions/mean_terminated_length": 637.9049682617188, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.3008, "grad_norm": 0.00024410766491200775, "learning_rate": 2.5e-06, "loss": 0.0087, "num_tokens": 53313750.0, "reward": 0.8173828125, "reward_std": 0.1166023537516594, "rewards/accuracy_reward": 0.6484375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 689.576171875, "completions/mean_terminated_length": 649.1837768554688, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.3072, "grad_norm": 0.00028568808920681477, "learning_rate": 2.4193548387096776e-06, "loss": 0.0072, "num_tokens": 54632059.0, "reward": 0.7998046875, "reward_std": 0.12417584657669067, "rewards/accuracy_reward": 0.61328125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.986328125, "rewards/mean_confidence_reward": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3823.0, "completions/mean_length": 627.0716552734375, "completions/mean_terminated_length": 613.468017578125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.3136, "grad_norm": 0.00025566882686689496, "learning_rate": 2.338709677419355e-06, "loss": 0.0036, "num_tokens": 55851281.0, "reward": 0.8310546875, "reward_std": 0.10658612847328186, "rewards/accuracy_reward": 0.6686198115348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 622.958984375, "completions/mean_terminated_length": 600.1998901367188, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.32, "grad_norm": 0.00041423627408221364, "learning_rate": 2.2580645161290324e-06, "loss": 0.0078, "num_tokens": 57059514.0, "reward": 0.8235677480697632, "reward_std": 0.11244156956672668, "rewards/accuracy_reward": 0.654296875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2564.0, "completions/mean_length": 636.1920776367188, "completions/mean_terminated_length": 595.1666870117188, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.3264, "grad_norm": 0.00027631345437839627, "learning_rate": 2.17741935483871e-06, "loss": 0.0093, "num_tokens": 58286153.0, "reward": 0.8111979365348816, "reward_std": 0.12525992095470428, "rewards/accuracy_reward": 0.6341145634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 617.8568115234375, "completions/mean_terminated_length": 581.2447509765625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3328, "grad_norm": 0.00026788286049850285, "learning_rate": 2.096774193548387e-06, "loss": 0.0075, "num_tokens": 59491837.0, "reward": 0.8157552480697632, "reward_std": 0.10741822421550751, "rewards/accuracy_reward": 0.642578125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9889323115348816, "rewards/mean_confidence_reward": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2485.0, "completions/mean_length": 622.8698120117188, "completions/mean_terminated_length": 616.0730590820312, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.3392, "grad_norm": 0.000268914649495855, "learning_rate": 2.0161290322580646e-06, "loss": 0.0051, "num_tokens": 60697301.0, "reward": 0.8232421875, "reward_std": 0.10318265855312347, "rewards/accuracy_reward": 0.6484375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.998046875, "rewards/mean_confidence_reward": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 623.26953125, "completions/mean_terminated_length": 598.2203369140625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.3456, "grad_norm": 0.0002533692750148475, "learning_rate": 1.935483870967742e-06, "loss": 0.0066, "num_tokens": 61905635.0, "reward": 0.8434244990348816, "reward_std": 0.10225149244070053, "rewards/accuracy_reward": 0.6946614384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3843.0, "completions/mean_length": 648.0703125, "completions/mean_terminated_length": 632.28515625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.352, "grad_norm": 0.0002154428220819682, "learning_rate": 1.8548387096774196e-06, "loss": 0.004, "num_tokens": 63155519.0, "reward": 0.8271484375, "reward_std": 0.09687401354312897, "rewards/accuracy_reward": 0.6595051884651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9947916865348816, "rewards/mean_confidence_reward": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3433.0, "completions/mean_length": 703.6119995117188, "completions/mean_terminated_length": 679.1422729492188, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3584, "grad_norm": 0.0002669694658834487, "learning_rate": 1.774193548387097e-06, "loss": 0.0069, "num_tokens": 64491107.0, "reward": 0.8330078125, "reward_std": 0.11356484144926071, "rewards/accuracy_reward": 0.6731770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 618.265625, "completions/mean_terminated_length": 602.343994140625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.3648, "grad_norm": 0.0002474285429343581, "learning_rate": 1.6935483870967742e-06, "loss": 0.0065, "num_tokens": 65693683.0, "reward": 0.8564453125, "reward_std": 0.10922061651945114, "rewards/accuracy_reward": 0.71875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3299.0, "completions/mean_length": 657.6087646484375, "completions/mean_terminated_length": 632.8071899414062, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.3712, "grad_norm": 0.0002707333769649267, "learning_rate": 1.6129032258064516e-06, "loss": 0.0061, "num_tokens": 66959650.0, "reward": 0.8492838740348816, "reward_std": 0.11089633405208588, "rewards/accuracy_reward": 0.7057291865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3742.0, "completions/mean_length": 635.080078125, "completions/mean_terminated_length": 614.6817626953125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.3776, "grad_norm": 0.00023831303406041116, "learning_rate": 1.5322580645161292e-06, "loss": 0.0042, "num_tokens": 68185781.0, "reward": 0.8180338740348816, "reward_std": 0.0983486920595169, "rewards/accuracy_reward": 0.6419270634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 638.1764526367188, "completions/mean_terminated_length": 613.2347412109375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.384, "grad_norm": 0.00025369905051775277, "learning_rate": 1.4516129032258066e-06, "loss": 0.0069, "num_tokens": 69419524.0, "reward": 0.8206380605697632, "reward_std": 0.10652142763137817, "rewards/accuracy_reward": 0.6490885615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9921875, "rewards/mean_confidence_reward": 0.0, "step": 60 }, { "epoch": 0.384, "eval_completions/clipped_ratio": 0.004131610576923073, "eval_completions/max_length": 3169.875, "eval_completions/max_terminated_length": 2246.5, "eval_completions/mean_length": 643.4401321411133, "eval_completions/mean_terminated_length": 629.0936508178711, "eval_completions/min_length": 188.5, "eval_completions/min_terminated_length": 188.5, "eval_loss": 0.0, "eval_num_tokens": 69419524.0, "eval_reward": 0.82373046875, "eval_reward_std": 0.23457887582480907, "eval_rewards/accuracy_reward": 0.6513671875, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.99609375, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 209.489, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.038, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 673.5443115234375, "completions/mean_terminated_length": 648.8577270507812, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.3904, "grad_norm": 0.00026515661738812923, "learning_rate": 1.3709677419354838e-06, "loss": 0.0045, "num_tokens": 70716608.0, "reward": 0.8011068105697632, "reward_std": 0.10466423630714417, "rewards/accuracy_reward": 0.6106770634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 686.5358276367188, "completions/mean_terminated_length": 657.433349609375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.3968, "grad_norm": 0.00023534742649644613, "learning_rate": 1.2903225806451614e-06, "loss": 0.0055, "num_tokens": 72025423.0, "reward": 0.826171875, "reward_std": 0.10892236232757568, "rewards/accuracy_reward": 0.662109375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3982.0, "completions/mean_length": 629.6354370117188, "completions/mean_terminated_length": 613.7658081054688, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.4032, "grad_norm": 0.00023829005658626556, "learning_rate": 1.2096774193548388e-06, "loss": 0.001, "num_tokens": 73244799.0, "reward": 0.8430989980697632, "reward_std": 0.09954534471035004, "rewards/accuracy_reward": 0.6927083134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 669.8873901367188, "completions/mean_terminated_length": 640.642822265625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.4096, "grad_norm": 0.00023246412456501275, "learning_rate": 1.1290322580645162e-06, "loss": 0.007, "num_tokens": 74524818.0, "reward": 0.8408203125, "reward_std": 0.11024834215641022, "rewards/accuracy_reward": 0.6901041865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3901.0, "completions/mean_length": 669.7200927734375, "completions/mean_terminated_length": 645.0059204101562, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.416, "grad_norm": 0.0002526069583836943, "learning_rate": 1.0483870967741936e-06, "loss": 0.005, "num_tokens": 75811244.0, "reward": 0.8414713740348816, "reward_std": 0.10166700929403305, "rewards/accuracy_reward": 0.6901041865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3167.0, "completions/mean_length": 673.7623901367188, "completions/mean_terminated_length": 651.336181640625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.4224, "grad_norm": 0.00027628280804492533, "learning_rate": 9.67741935483871e-07, "loss": 0.0066, "num_tokens": 77098391.0, "reward": 0.8313802480697632, "reward_std": 0.11165839433670044, "rewards/accuracy_reward": 0.6692708134651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9934895634651184, "rewards/mean_confidence_reward": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3942.0, "completions/mean_length": 667.787109375, "completions/mean_terminated_length": 633.9783325195312, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.4288, "grad_norm": 0.00024809566093608737, "learning_rate": 8.870967741935485e-07, "loss": 0.0071, "num_tokens": 78374368.0, "reward": 0.8408203125, "reward_std": 0.11645618081092834, "rewards/accuracy_reward": 0.69140625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00651041666666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4054.0, "completions/mean_length": 649.001953125, "completions/mean_terminated_length": 626.4135131835938, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4352, "grad_norm": 0.000251438410487026, "learning_rate": 8.064516129032258e-07, "loss": 0.0059, "num_tokens": 79626331.0, "reward": 0.8284505605697632, "reward_std": 0.10573884099721909, "rewards/accuracy_reward": 0.6653645634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3946.0, "completions/mean_length": 653.7532958984375, "completions/mean_terminated_length": 612.9360961914062, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.4416, "grad_norm": 0.00026661829906515777, "learning_rate": 7.258064516129033e-07, "loss": 0.0082, "num_tokens": 80884856.0, "reward": 0.806640625, "reward_std": 0.10683833807706833, "rewards/accuracy_reward": 0.6256510615348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9876301884651184, "rewards/mean_confidence_reward": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 2801.0, "completions/mean_length": 655.6484375, "completions/mean_terminated_length": 639.89794921875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.448, "grad_norm": 0.0003659256035462022, "learning_rate": 6.451612903225807e-07, "loss": 0.0045, "num_tokens": 82146524.0, "reward": 0.8385416865348816, "reward_std": 0.11747505515813828, "rewards/accuracy_reward": 0.681640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9954426884651184, "rewards/mean_confidence_reward": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 659.3053588867188, "completions/mean_terminated_length": 618.5540161132812, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.4544, "grad_norm": 0.0002962271682918072, "learning_rate": 5.645161290322581e-07, "loss": 0.0081, "num_tokens": 83408017.0, "reward": 0.8626302480697632, "reward_std": 0.10841435194015503, "rewards/accuracy_reward": 0.7369791865348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98828125, "rewards/mean_confidence_reward": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00455729166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 649.5560302734375, "completions/mean_terminated_length": 633.777587890625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.4608, "grad_norm": 0.00029511775937862694, "learning_rate": 4.838709677419355e-07, "loss": 0.0044, "num_tokens": 84660903.0, "reward": 0.8092448115348816, "reward_std": 0.12138192355632782, "rewards/accuracy_reward": 0.6243489384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.994140625, "rewards/mean_confidence_reward": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00846354166666663, "completions/max_length": 4096.0, "completions/max_terminated_length": 3994.0, "completions/mean_length": 662.8704833984375, "completions/mean_terminated_length": 633.5659790039062, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4672, "grad_norm": 0.00026064313715323806, "learning_rate": 4.032258064516129e-07, "loss": 0.0041, "num_tokens": 85924768.0, "reward": 0.8209635615348816, "reward_std": 0.10173972696065903, "rewards/accuracy_reward": 0.650390625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9915364384651184, "rewards/mean_confidence_reward": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 613.3997802734375, "completions/mean_terminated_length": 606.58447265625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.4736, "grad_norm": 0.00026255525881424546, "learning_rate": 3.2258064516129035e-07, "loss": 0.0037, "num_tokens": 87117086.0, "reward": 0.8444010615348816, "reward_std": 0.08728945255279541, "rewards/accuracy_reward": 0.69140625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9973958134651184, "rewards/mean_confidence_reward": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00911458333333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 667.43359375, "completions/mean_terminated_length": 635.8961791992188, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.48, "grad_norm": 0.0002573035017121583, "learning_rate": 2.4193548387096775e-07, "loss": 0.0061, "num_tokens": 88391336.0, "reward": 0.8186849355697632, "reward_std": 0.11762097477912903, "rewards/accuracy_reward": 0.6471354365348816, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.990234375, "rewards/mean_confidence_reward": 0.0, "step": 75 }, { "epoch": 0.48, "eval_completions/clipped_ratio": 0.0048828125, "eval_completions/max_length": 2728.25, "eval_completions/max_terminated_length": 2279.75, "eval_completions/mean_length": 641.2385101318359, "eval_completions/mean_terminated_length": 624.3703231811523, "eval_completions/min_length": 189.125, "eval_completions/min_terminated_length": 189.125, "eval_loss": 0.0, "eval_num_tokens": 88391336.0, "eval_reward": 0.8330078125, "eval_reward_std": 0.23016384057700634, "eval_rewards/accuracy_reward": 0.6708984375, "eval_rewards/brier_reward": 0.0, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9951171875, "eval_rewards/mean_confidence_reward": 0.0, "eval_runtime": 182.2902, "eval_samples_per_second": 5.486, "eval_steps_per_second": 0.044, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0032552083333333703, "completions/max_length": 4096.0, "completions/max_terminated_length": 3493.0, "completions/mean_length": 640.0716552734375, "completions/mean_terminated_length": 628.7850952148438, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.4864, "grad_norm": 0.00025482551427558064, "learning_rate": 1.6129032258064518e-07, "loss": 0.0043, "num_tokens": 89623294.0, "reward": 0.8557943105697632, "reward_std": 0.11010631918907166, "rewards/accuracy_reward": 0.71484375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9967448115348816, "rewards/mean_confidence_reward": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00716145833333337, "completions/max_length": 4096.0, "completions/max_terminated_length": 3253.0, "completions/mean_length": 644.4609375, "completions/mean_terminated_length": 619.5645751953125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.4928, "grad_norm": 0.00024977046996355057, "learning_rate": 8.064516129032259e-08, "loss": 0.0053, "num_tokens": 90863114.0, "reward": 0.8515625, "reward_std": 0.09298877418041229, "rewards/accuracy_reward": 0.7102864384651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9928385615348816, "rewards/mean_confidence_reward": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3669.0, "completions/mean_length": 647.9694213867188, "completions/mean_terminated_length": 634.44775390625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.4992, "grad_norm": 0.0002479875402059406, "learning_rate": 0.0, "loss": 0.0024, "num_tokens": 92113427.0, "reward": 0.8307291865348816, "reward_std": 0.11155258119106293, "rewards/accuracy_reward": 0.6653645634651184, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99609375, "rewards/mean_confidence_reward": 0.0, "step": 78 }, { "epoch": 0.4992, "step": 78, "total_flos": 0.0, "train_loss": 0.00840686316578052, "train_runtime": 15684.0191, "train_samples_per_second": 0.956, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 78, "num_input_tokens_seen": 92113427, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }