{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 50, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0400390625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1521.8, "completions/mean_length": 276.6455078125, "completions/mean_terminated_length": 224.1190185546875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.016, "grad_norm": 0.05407445505261421, "learning_rate": 3.1249999999999997e-07, "loss": 0.0956, "num_tokens": 17676882.0, "reward": 0.6326022028923035, "reward_std": 0.4947403073310852, "rewards/accuracy_reward": 0.2208984375, "rewards/brier_reward": 0.3710617899894714, "rewards/confidence_one_or_zero": 0.27548828125, "rewards/format_reward": 0.6732421875, "rewards/mean_confidence_reward": 0.7399574875831604, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.034765625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1498.2, "completions/mean_length": 258.75498046875, "completions/mean_terminated_length": 212.7857635498047, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.032, "grad_norm": 0.0354408323764801, "learning_rate": 6.249999999999999e-07, "loss": 0.0885, "num_tokens": 35426885.0, "reward": 0.6595678806304932, "reward_std": 0.46407333612442014, "rewards/accuracy_reward": 0.21484375, "rewards/brier_reward": 0.38378217816352844, "rewards/confidence_one_or_zero": 0.26513671875, "rewards/format_reward": 0.7205078125, "rewards/mean_confidence_reward": 0.7485471248626709, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0203125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1443.6, "completions/mean_length": 203.36826171875, "completions/mean_terminated_length": 175.87453918457032, "completions/min_length": 1.8, "completions/min_terminated_length": 1.8, "epoch": 0.048, "grad_norm": 0.030709726735949516, "learning_rate": 9.374999999999999e-07, "loss": 0.0683, "num_tokens": 52558112.0, "reward": 0.8185818791389465, "reward_std": 0.37540732622146605, "rewards/accuracy_reward": 0.2767578125, "rewards/brier_reward": 0.4851109802722931, "rewards/confidence_one_or_zero": 0.26318359375, "rewards/format_reward": 0.87529296875, "rewards/mean_confidence_reward": 0.763912582397461, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0048828125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1206.6, "completions/mean_length": 136.7234375, "completions/mean_terminated_length": 129.86282653808593, "completions/min_length": 7.4, "completions/min_terminated_length": 7.4, "epoch": 0.064, "grad_norm": 0.030825674533843994, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 68876560.0, "reward": 0.944485855102539, "reward_std": 0.2918001413345337, "rewards/accuracy_reward": 0.336328125, "rewards/brier_reward": 0.5843800187110901, "rewards/confidence_one_or_zero": 0.20009765625, "rewards/format_reward": 0.96826171875, "rewards/mean_confidence_reward": 0.7416761994361878, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00166015625, "completions/max_length": 1536.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 121.93955078125, "completions/mean_terminated_length": 119.58839721679688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.08, "grad_norm": 0.07219453901052475, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 85058373.0, "reward": 1.0135140299797059, "reward_std": 0.22123381197452546, "rewards/accuracy_reward": 0.3626953125, "rewards/brier_reward": 0.6754642128944397, "rewards/confidence_one_or_zero": 0.083203125, "rewards/format_reward": 0.9888671875, "rewards/mean_confidence_reward": 0.6404195070266724, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 1187.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 122.44267578125, "completions/mean_terminated_length": 121.89026336669922, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.096, "grad_norm": 0.0071799191646277905, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 101356794.0, "reward": 1.050560426712036, "reward_std": 0.16749218702316285, "rewards/accuracy_reward": 0.36787109375, "rewards/brier_reward": 0.7357877850532532, "rewards/confidence_one_or_zero": 0.04443359375, "rewards/format_reward": 0.9974609375, "rewards/mean_confidence_reward": 0.5114866554737091, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0005859375, "completions/max_length": 822.8, "completions/max_terminated_length": 457.6, "completions/mean_length": 125.12060546875, "completions/mean_terminated_length": 124.29327087402343, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.112, "grad_norm": 0.012783159501850605, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 117747501.0, "reward": 1.0780974626541138, "reward_std": 0.12303584218025207, "rewards/accuracy_reward": 0.40283203125, "rewards/brier_reward": 0.7566823959350586, "rewards/confidence_one_or_zero": 0.05263671875, "rewards/format_reward": 0.9966796875, "rewards/mean_confidence_reward": 0.3563989281654358, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 1302.2, "completions/max_terminated_length": 380.2, "completions/mean_length": 131.248828125, "completions/mean_terminated_length": 130.69945831298827, "completions/min_length": 44.2, "completions/min_terminated_length": 44.2, "epoch": 0.128, "grad_norm": 0.03782346472144127, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 134008161.0, "reward": 1.0803744792938232, "reward_std": 0.09947807043790817, "rewards/accuracy_reward": 0.40439453125, "rewards/brier_reward": 0.7587952256202698, "rewards/confidence_one_or_zero": 0.04990234375, "rewards/format_reward": 0.99755859375, "rewards/mean_confidence_reward": 0.30792068839073183, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 767.4, "completions/max_terminated_length": 526.6, "completions/mean_length": 134.51650390625, "completions/mean_terminated_length": 134.37986755371094, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.144, "grad_norm": 0.0044364649802446365, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 150336042.0, "reward": 1.1184379577636718, "reward_std": 0.10152655839920044, "rewards/accuracy_reward": 0.5048828125, "rewards/brier_reward": 0.7332618951797485, "rewards/confidence_one_or_zero": 0.03076171875, "rewards/format_reward": 0.99873046875, "rewards/mean_confidence_reward": 0.33715721368789675, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 596.4, "completions/max_terminated_length": 347.8, "completions/mean_length": 141.1640625, "completions/mean_terminated_length": 141.02822570800782, "completions/min_length": 53.6, "completions/min_terminated_length": 53.6, "epoch": 0.16, "grad_norm": 0.0031058751046657562, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 166802490.0, "reward": 1.1045508146286012, "reward_std": 0.10970858335494996, "rewards/accuracy_reward": 0.4513671875, "rewards/brier_reward": 0.7593937039375305, "rewards/confidence_one_or_zero": 0.01396484375, "rewards/format_reward": 0.99833984375, "rewards/mean_confidence_reward": 0.4095295906066895, "step": 50 }, { "epoch": 0.16, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 249.75, "eval_completions/max_terminated_length": 249.75, "eval_completions/mean_length": 143.74979782104492, "eval_completions/mean_terminated_length": 143.74979782104492, "eval_completions/min_length": 72.75, "eval_completions/min_terminated_length": 72.75, "eval_loss": 0.0, "eval_num_tokens": 166802490.0, "eval_reward": 1.0683082938194275, "eval_reward_std": 0.22595830261707306, "eval_rewards/accuracy_reward": 0.3671875, "eval_rewards/brier_reward": 0.7694281339645386, "eval_rewards/confidence_one_or_zero": 0.0078125, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.4460156336426735, "eval_runtime": 17.2028, "eval_samples_per_second": 29.065, "eval_steps_per_second": 0.233, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 147.52265625, "completions/mean_terminated_length": 147.52265625, "completions/min_length": 56.2, "completions/min_terminated_length": 56.2, "epoch": 0.176, "grad_norm": 0.008834286592900753, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 183550242.0, "reward": 1.105972409248352, "reward_std": 0.10724246203899383, "rewards/accuracy_reward": 0.44873046875, "rewards/brier_reward": 0.7641900300979614, "rewards/confidence_one_or_zero": 0.01337890625, "rewards/format_reward": 0.9990234375, "rewards/mean_confidence_reward": 0.4539414048194885, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 155.03759765625, "completions/mean_terminated_length": 155.03759765625, "completions/min_length": 68.2, "completions/min_terminated_length": 68.2, "epoch": 0.192, "grad_norm": 0.0015183566138148308, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 199952643.0, "reward": 1.1174006700515746, "reward_std": 0.10823124945163727, "rewards/accuracy_reward": 0.4720703125, "rewards/brier_reward": 0.7630230784416199, "rewards/confidence_one_or_zero": 0.00830078125, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.4763046860694885, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.8, "completions/max_terminated_length": 429.8, "completions/mean_length": 164.254296875, "completions/mean_terminated_length": 164.254296875, "completions/min_length": 82.2, "completions/min_terminated_length": 82.2, "epoch": 0.208, "grad_norm": 0.0033636174630373716, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 216666831.0, "reward": 1.144743847846985, "reward_std": 0.11080079525709152, "rewards/accuracy_reward": 0.52763671875, "rewards/brier_reward": 0.7621429681777954, "rewards/confidence_one_or_zero": 0.008203125, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.48511279225349424, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 674.8, "completions/max_terminated_length": 462.6, "completions/mean_length": 168.61201171875, "completions/mean_terminated_length": 168.47879638671876, "completions/min_length": 77.4, "completions/min_terminated_length": 77.4, "epoch": 0.224, "grad_norm": 0.0016244107391685247, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 233546602.0, "reward": 1.1177247285842895, "reward_std": 0.10474657416343688, "rewards/accuracy_reward": 0.46884765625, "rewards/brier_reward": 0.7668938279151917, "rewards/confidence_one_or_zero": 0.00947265625, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.4841289222240448, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 651.0, "completions/max_terminated_length": 417.8, "completions/mean_length": 174.24892578125, "completions/mean_terminated_length": 174.11595458984374, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.24, "grad_norm": 0.002639307640492916, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 250582591.0, "reward": 1.1502854824066162, "reward_std": 0.12000200897455215, "rewards/accuracy_reward": 0.53818359375, "rewards/brier_reward": 0.7634606242179871, "rewards/confidence_one_or_zero": 0.005078125, "rewards/format_reward": 0.99892578125, "rewards/mean_confidence_reward": 0.489949232339859, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 843.8, "completions/max_terminated_length": 765.6, "completions/mean_length": 175.550390625, "completions/mean_terminated_length": 175.02010498046874, "completions/min_length": 81.6, "completions/min_terminated_length": 81.6, "epoch": 0.256, "grad_norm": 0.02150336280465126, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 267435043.0, "reward": 1.1393208265304566, "reward_std": 0.11671655029058456, "rewards/accuracy_reward": 0.5083984375, "rewards/brier_reward": 0.7709258198738098, "rewards/confidence_one_or_zero": 0.01162109375, "rewards/format_reward": 0.99931640625, "rewards/mean_confidence_reward": 0.48744922280311587, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 183.68974609375, "completions/mean_terminated_length": 183.68974609375, "completions/min_length": 79.4, "completions/min_terminated_length": 79.4, "epoch": 0.272, "grad_norm": 0.0036507430486381054, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 284281722.0, "reward": 1.1324650287628173, "reward_std": 0.11320338100194931, "rewards/accuracy_reward": 0.4955078125, "rewards/brier_reward": 0.7696167230606079, "rewards/confidence_one_or_zero": 0.01044921875, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.48797558546066283, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 675.0, "completions/max_terminated_length": 453.2, "completions/mean_length": 183.3435546875, "completions/mean_terminated_length": 183.211669921875, "completions/min_length": 71.6, "completions/min_terminated_length": 71.6, "epoch": 0.288, "grad_norm": 0.00159507489297539, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 301117336.0, "reward": 1.1380449295043946, "reward_std": 0.11923972368240357, "rewards/accuracy_reward": 0.50419921875, "rewards/brier_reward": 0.7722803950309753, "rewards/confidence_one_or_zero": 0.00966796875, "rewards/format_reward": 0.999609375, "rewards/mean_confidence_reward": 0.4882441520690918, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 729.4, "completions/max_terminated_length": 545.0, "completions/mean_length": 188.354296875, "completions/mean_terminated_length": 188.22288818359374, "completions/min_length": 74.2, "completions/min_terminated_length": 74.2, "epoch": 0.304, "grad_norm": 0.0020240871235728264, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 317976036.0, "reward": 1.1388062238693237, "reward_std": 0.11345189213752746, "rewards/accuracy_reward": 0.50478515625, "rewards/brier_reward": 0.7739005923271179, "rewards/confidence_one_or_zero": 0.01728515625, "rewards/format_reward": 0.99892578125, "rewards/mean_confidence_reward": 0.48295703530311584, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 662.4, "completions/max_terminated_length": 447.6, "completions/mean_length": 191.061328125, "completions/mean_terminated_length": 190.9298309326172, "completions/min_length": 86.8, "completions/min_terminated_length": 86.8, "epoch": 0.32, "grad_norm": 0.0013722889125347137, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 335021208.0, "reward": 1.1509589195251464, "reward_std": 0.1035462662577629, "rewards/accuracy_reward": 0.52197265625, "rewards/brier_reward": 0.7800418734550476, "rewards/confidence_one_or_zero": 0.01318359375, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.4988081157207489, "step": 100 }, { "epoch": 0.32, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 411.75, "eval_completions/max_terminated_length": 411.75, "eval_completions/mean_length": 195.53663635253906, "eval_completions/mean_terminated_length": 195.53663635253906, "eval_completions/min_length": 107.25, "eval_completions/min_terminated_length": 107.25, "eval_loss": 0.0, "eval_num_tokens": 335021208.0, "eval_reward": 1.0887417793273926, "eval_reward_std": 0.25174758210778236, "eval_rewards/accuracy_reward": 0.400390625, "eval_rewards/brier_reward": 0.7770919799804688, "eval_rewards/confidence_one_or_zero": 0.013671875, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.46802735328674316, "eval_runtime": 22.2738, "eval_samples_per_second": 22.448, "eval_steps_per_second": 0.18, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.000390625, "completions/max_length": 1125.0, "completions/max_terminated_length": 479.8, "completions/mean_length": 196.13671875, "completions/mean_terminated_length": 195.61272583007812, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.336, "grad_norm": 0.0013452547136694193, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 351752080.0, "reward": 1.1555601119995118, "reward_std": 0.11393154710531235, "rewards/accuracy_reward": 0.52939453125, "rewards/brier_reward": 0.7823106408119201, "rewards/confidence_one_or_zero": 0.01396484375, "rewards/format_reward": 0.9994140625, "rewards/mean_confidence_reward": 0.49622313380241395, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 908.0, "completions/max_terminated_length": 493.8, "completions/mean_length": 199.18369140625, "completions/mean_terminated_length": 198.923095703125, "completions/min_length": 97.6, "completions/min_terminated_length": 97.6, "epoch": 0.352, "grad_norm": 0.002751865889877081, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 369052137.0, "reward": 1.1255475521087646, "reward_std": 0.11049925088882447, "rewards/accuracy_reward": 0.4666015625, "rewards/brier_reward": 0.7846879482269287, "rewards/confidence_one_or_zero": 0.01533203125, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.4882112622261047, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.8, "completions/max_terminated_length": 627.8, "completions/mean_length": 203.8998046875, "completions/mean_terminated_length": 203.8998046875, "completions/min_length": 95.6, "completions/min_terminated_length": 95.6, "epoch": 0.368, "grad_norm": 0.002145805163308978, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 386205543.0, "reward": 1.1361007690429688, "reward_std": 0.1053838849067688, "rewards/accuracy_reward": 0.48994140625, "rewards/brier_reward": 0.7822591662406921, "rewards/confidence_one_or_zero": 0.01376953125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5061289191246032, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.6, "completions/max_terminated_length": 659.6, "completions/mean_length": 201.56748046875, "completions/mean_terminated_length": 201.56748046875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.384, "grad_norm": 0.002008226700127125, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 403126106.0, "reward": 1.1584414958953857, "reward_std": 0.10435761213302612, "rewards/accuracy_reward": 0.521484375, "rewards/brier_reward": 0.7953975558280945, "rewards/confidence_one_or_zero": 0.01845703125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5107519447803497, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.2, "completions/max_terminated_length": 486.2, "completions/mean_length": 201.58603515625, "completions/mean_terminated_length": 201.58603515625, "completions/min_length": 96.8, "completions/min_terminated_length": 96.8, "epoch": 0.4, "grad_norm": 0.004599791020154953, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 420226795.0, "reward": 1.1391048192977906, "reward_std": 0.11113806515932083, "rewards/accuracy_reward": 0.4962890625, "rewards/brier_reward": 0.781919538974762, "rewards/confidence_one_or_zero": 0.0142578125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5150127053260803, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.2, "completions/max_terminated_length": 573.2, "completions/mean_length": 201.14990234375, "completions/mean_terminated_length": 201.14990234375, "completions/min_length": 98.6, "completions/min_terminated_length": 98.6, "epoch": 0.416, "grad_norm": 0.0012156119337305427, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 437167754.0, "reward": 1.150643491744995, "reward_std": 0.1118047833442688, "rewards/accuracy_reward": 0.50810546875, "rewards/brier_reward": 0.7934735059738159, "rewards/confidence_one_or_zero": 0.0142578125, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.4986997008323669, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 674.8, "completions/max_terminated_length": 482.4, "completions/mean_length": 200.10625, "completions/mean_terminated_length": 199.97588195800782, "completions/min_length": 95.4, "completions/min_terminated_length": 95.4, "epoch": 0.432, "grad_norm": 0.006859796121716499, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 454231178.0, "reward": 1.1733263969421386, "reward_std": 0.10346025228500366, "rewards/accuracy_reward": 0.54521484375, "rewards/brier_reward": 0.8016322970390319, "rewards/confidence_one_or_zero": 0.01435546875, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.5186787366867065, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 719.8, "completions/max_terminated_length": 518.8, "completions/mean_length": 208.76640625, "completions/mean_terminated_length": 208.63664855957032, "completions/min_length": 96.8, "completions/min_terminated_length": 96.8, "epoch": 0.448, "grad_norm": 0.0016618920490145683, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 471321746.0, "reward": 1.1534050226211547, "reward_std": 0.11015735268592834, "rewards/accuracy_reward": 0.51103515625, "rewards/brier_reward": 0.7959691882133484, "rewards/confidence_one_or_zero": 0.0083984375, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.5418408274650574, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.8, "completions/max_terminated_length": 566.8, "completions/mean_length": 215.3693359375, "completions/mean_terminated_length": 215.3693359375, "completions/min_length": 99.2, "completions/min_terminated_length": 99.2, "epoch": 0.464, "grad_norm": 0.0014355273451656103, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 488697944.0, "reward": 1.1193523406982422, "reward_std": 0.10776209384202957, "rewards/accuracy_reward": 0.458984375, "rewards/brier_reward": 0.7799146294593811, "rewards/confidence_one_or_zero": 0.011328125, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.520960932970047, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 752.6, "completions/max_terminated_length": 530.4, "completions/mean_length": 216.18486328125, "completions/mean_terminated_length": 215.92719421386718, "completions/min_length": 108.8, "completions/min_terminated_length": 108.8, "epoch": 0.48, "grad_norm": 0.0018099879380315542, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 505959709.0, "reward": 1.1509707689285278, "reward_std": 0.10748694986104965, "rewards/accuracy_reward": 0.512109375, "rewards/brier_reward": 0.7900263905525208, "rewards/confidence_one_or_zero": 0.01904296875, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.4951982319355011, "step": 150 }, { "epoch": 0.48, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 374.5, "eval_completions/max_terminated_length": 374.5, "eval_completions/mean_length": 216.64392471313477, "eval_completions/mean_terminated_length": 216.64392471313477, "eval_completions/min_length": 125.0, "eval_completions/min_terminated_length": 125.0, "eval_loss": 0.0, "eval_num_tokens": 505959709.0, "eval_reward": 1.1084575355052948, "eval_reward_std": 0.2621918395161629, "eval_rewards/accuracy_reward": 0.41015625, "eval_rewards/brier_reward": 0.8067578077316284, "eval_rewards/confidence_one_or_zero": 0.0234375, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.4730468988418579, "eval_runtime": 21.7939, "eval_samples_per_second": 22.942, "eval_steps_per_second": 0.184, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.8, "completions/max_terminated_length": 588.8, "completions/mean_length": 218.93642578125, "completions/mean_terminated_length": 218.93642578125, "completions/min_length": 106.4, "completions/min_terminated_length": 106.4, "epoch": 0.496, "grad_norm": 0.0012288556899875402, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 523509458.0, "reward": 1.1674549341201783, "reward_std": 0.0982852265238762, "rewards/accuracy_reward": 0.5412109375, "rewards/brier_reward": 0.7936979055404663, "rewards/confidence_one_or_zero": 0.01201171875, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5015585958957672, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 729.2, "completions/max_terminated_length": 520.6, "completions/mean_length": 218.68642578125, "completions/mean_terminated_length": 218.5576599121094, "completions/min_length": 110.6, "completions/min_terminated_length": 110.6, "epoch": 0.512, "grad_norm": 0.0026432271115481853, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 540894471.0, "reward": 1.1772954702377318, "reward_std": 0.10269609093666077, "rewards/accuracy_reward": 0.54892578125, "rewards/brier_reward": 0.8057618141174316, "rewards/confidence_one_or_zero": 0.00927734375, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.5223603427410126, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00029296875, "completions/max_length": 1145.4, "completions/max_terminated_length": 564.4, "completions/mean_length": 220.1521484375, "completions/mean_terminated_length": 219.7665252685547, "completions/min_length": 113.2, "completions/min_terminated_length": 113.2, "epoch": 0.528, "grad_norm": 0.0011872347677126527, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 558178365.0, "reward": 1.1743324041366576, "reward_std": 0.09865072965621949, "rewards/accuracy_reward": 0.53583984375, "rewards/brier_reward": 0.8131169199943542, "rewards/confidence_one_or_zero": 0.0087890625, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.5171630859375, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 908.0, "completions/max_terminated_length": 513.6, "completions/mean_length": 224.51904296875, "completions/mean_terminated_length": 224.26331481933593, "completions/min_length": 103.4, "completions/min_terminated_length": 103.4, "epoch": 0.544, "grad_norm": 0.0013052740832790732, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 575641024.0, "reward": 1.180383038520813, "reward_std": 0.10921536087989807, "rewards/accuracy_reward": 0.56044921875, "rewards/brier_reward": 0.80060875415802, "rewards/confidence_one_or_zero": 0.00556640625, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.5579150438308715, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.4, "completions/max_terminated_length": 497.4, "completions/mean_length": 223.90068359375, "completions/mean_terminated_length": 223.90068359375, "completions/min_length": 108.6, "completions/min_terminated_length": 108.6, "epoch": 0.56, "grad_norm": 0.0013972694287076592, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 592755175.0, "reward": 1.1710703134536744, "reward_std": 0.10018587708473206, "rewards/accuracy_reward": 0.53154296875, "rewards/brier_reward": 0.8105966329574585, "rewards/confidence_one_or_zero": 0.00439453125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5557578206062317, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 913.2, "completions/max_terminated_length": 470.2, "completions/mean_length": 222.59306640625, "completions/mean_terminated_length": 222.33611450195312, "completions/min_length": 112.2, "completions/min_terminated_length": 112.2, "epoch": 0.576, "grad_norm": 0.0014351216377690434, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 610221152.0, "reward": 1.1574892282485962, "reward_std": 0.09260296672582627, "rewards/accuracy_reward": 0.516796875, "rewards/brier_reward": 0.7984735131263733, "rewards/confidence_one_or_zero": 0.00673828125, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.5536435604095459, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.6, "completions/max_terminated_length": 476.6, "completions/mean_length": 223.04169921875, "completions/mean_terminated_length": 223.04169921875, "completions/min_length": 112.4, "completions/min_terminated_length": 112.4, "epoch": 0.592, "grad_norm": 0.002771401545032859, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 627672811.0, "reward": 1.1642754554748536, "reward_std": 0.09222442507743836, "rewards/accuracy_reward": 0.52421875, "rewards/brier_reward": 0.80452641248703, "rewards/confidence_one_or_zero": 0.008984375, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.5350224733352661, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 928.8, "completions/max_terminated_length": 539.6, "completions/mean_length": 225.85869140625, "completions/mean_terminated_length": 225.60252990722657, "completions/min_length": 113.2, "completions/min_terminated_length": 113.2, "epoch": 0.608, "grad_norm": 0.0009160145418718457, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 644985092.0, "reward": 1.1762210130691528, "reward_std": 0.0809452623128891, "rewards/accuracy_reward": 0.52841796875, "rewards/brier_reward": 0.8242184281349182, "rewards/confidence_one_or_zero": 0.01025390625, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.5095595717430115, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.2, "completions/max_terminated_length": 490.2, "completions/mean_length": 225.5041015625, "completions/mean_terminated_length": 225.5041015625, "completions/min_length": 107.4, "completions/min_terminated_length": 107.4, "epoch": 0.624, "grad_norm": 0.0012107606744393706, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 662638158.0, "reward": 1.1756139755249024, "reward_std": 0.09611473232507706, "rewards/accuracy_reward": 0.531640625, "rewards/brier_reward": 0.8195863008499146, "rewards/confidence_one_or_zero": 0.01044921875, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.515623027086258, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0001953125, "completions/max_length": 902.8, "completions/max_terminated_length": 522.0, "completions/mean_length": 229.905078125, "completions/mean_terminated_length": 229.64991760253906, "completions/min_length": 114.4, "completions/min_terminated_length": 114.4, "epoch": 0.64, "grad_norm": 0.0014958431711420417, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 680335074.0, "reward": 1.1860469579696655, "reward_std": 0.09416615813970566, "rewards/accuracy_reward": 0.562890625, "rewards/brier_reward": 0.8094952344894409, "rewards/confidence_one_or_zero": 0.01044921875, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.5647656202316285, "step": 200 }, { "epoch": 0.64, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 376.0, "eval_completions/max_terminated_length": 376.0, "eval_completions/mean_length": 231.17133712768555, "eval_completions/mean_terminated_length": 231.17133712768555, "eval_completions/min_length": 133.75, "eval_completions/min_terminated_length": 133.75, "eval_loss": 0.0, "eval_num_tokens": 680335074.0, "eval_reward": 1.113263338804245, "eval_reward_std": 0.29021773487329483, "eval_rewards/accuracy_reward": 0.43359375, "eval_rewards/brier_reward": 0.7929318398237228, "eval_rewards/confidence_one_or_zero": 0.009765625, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.5252539217472076, "eval_runtime": 22.36, "eval_samples_per_second": 22.361, "eval_steps_per_second": 0.179, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 718.8, "completions/max_terminated_length": 514.2, "completions/mean_length": 229.0859375, "completions/mean_terminated_length": 228.9575408935547, "completions/min_length": 106.6, "completions/min_terminated_length": 106.6, "epoch": 0.656, "grad_norm": 0.0010967873968183994, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 697537458.0, "reward": 1.1436767816543578, "reward_std": 0.10650671422481536, "rewards/accuracy_reward": 0.500390625, "rewards/brier_reward": 0.7871571063995362, "rewards/confidence_one_or_zero": 0.00869140625, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.5562148451805115, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.4, "completions/max_terminated_length": 501.4, "completions/mean_length": 230.0017578125, "completions/mean_terminated_length": 230.0017578125, "completions/min_length": 113.6, "completions/min_terminated_length": 113.6, "epoch": 0.672, "grad_norm": 0.0009973476408049464, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 714806116.0, "reward": 1.1648722887039185, "reward_std": 0.0889286831021309, "rewards/accuracy_reward": 0.51943359375, "rewards/brier_reward": 0.8103100538253785, "rewards/confidence_one_or_zero": 0.01181640625, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.534494137763977, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 771.2, "completions/max_terminated_length": 594.6, "completions/mean_length": 229.80078125, "completions/mean_terminated_length": 229.6729309082031, "completions/min_length": 108.2, "completions/min_terminated_length": 108.2, "epoch": 0.688, "grad_norm": 0.003850990440696478, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 732113196.0, "reward": 1.170497512817383, "reward_std": 0.09079546928405761, "rewards/accuracy_reward": 0.5337890625, "rewards/brier_reward": 0.8074003338813782, "rewards/confidence_one_or_zero": 0.00693359375, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.5142109453678131, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 224.29423828125, "completions/mean_terminated_length": 224.29423828125, "completions/min_length": 109.6, "completions/min_terminated_length": 109.6, "epoch": 0.704, "grad_norm": 0.000887486501596868, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 749276113.0, "reward": 1.1758257865905761, "reward_std": 0.08791445046663285, "rewards/accuracy_reward": 0.53662109375, "rewards/brier_reward": 0.8150294065475464, "rewards/confidence_one_or_zero": 0.010546875, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5334873080253602, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.8, "completions/max_terminated_length": 443.8, "completions/mean_length": 221.7052734375, "completions/mean_terminated_length": 221.7052734375, "completions/min_length": 106.2, "completions/min_terminated_length": 106.2, "epoch": 0.72, "grad_norm": 0.001327179721556604, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 766556231.0, "reward": 1.1897984266281127, "reward_std": 0.08169474899768829, "rewards/accuracy_reward": 0.56044921875, "rewards/brier_reward": 0.8191466093063354, "rewards/confidence_one_or_zero": 0.00361328125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5215481758117676, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.6, "completions/max_terminated_length": 440.6, "completions/mean_length": 220.46484375, "completions/mean_terminated_length": 220.46484375, "completions/min_length": 108.6, "completions/min_terminated_length": 108.6, "epoch": 0.736, "grad_norm": 0.0015731732128188014, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 783753375.0, "reward": 1.1876837491989136, "reward_std": 0.08695107698440552, "rewards/accuracy_reward": 0.566015625, "rewards/brier_reward": 0.8093507885932922, "rewards/confidence_one_or_zero": 0.003515625, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5321669936180115, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 644.6, "completions/max_terminated_length": 438.8, "completions/mean_length": 220.48974609375, "completions/mean_terminated_length": 220.36173095703126, "completions/min_length": 106.8, "completions/min_terminated_length": 106.8, "epoch": 0.752, "grad_norm": 0.0010616250801831484, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 801238390.0, "reward": 1.1859095811843872, "reward_std": 0.08611637353897095, "rewards/accuracy_reward": 0.561328125, "rewards/brier_reward": 0.8105875492095947, "rewards/confidence_one_or_zero": 0.00380859375, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.5318828105926514, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.2, "completions/max_terminated_length": 500.2, "completions/mean_length": 230.7822265625, "completions/mean_terminated_length": 230.7822265625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.768, "grad_norm": 0.0017465156270191073, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 818534304.0, "reward": 1.1670047283172607, "reward_std": 0.09165385216474534, "rewards/accuracy_reward": 0.51474609375, "rewards/brier_reward": 0.8192623376846313, "rewards/confidence_one_or_zero": 0.003125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5243632674217225, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.4, "completions/max_terminated_length": 461.4, "completions/mean_length": 234.57412109375, "completions/mean_terminated_length": 234.57412109375, "completions/min_length": 124.4, "completions/min_terminated_length": 124.4, "epoch": 0.784, "grad_norm": 0.0015328590525314212, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 836110711.0, "reward": 1.1777088403701783, "reward_std": 0.08737877309322357, "rewards/accuracy_reward": 0.55556640625, "rewards/brier_reward": 0.8000456333160401, "rewards/confidence_one_or_zero": 0.00361328125, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.5257382750511169, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.8, "completions/max_terminated_length": 496.8, "completions/mean_length": 237.66533203125, "completions/mean_terminated_length": 237.66533203125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.8, "grad_norm": 0.0011416386114433408, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 853554964.0, "reward": 1.2036036729812623, "reward_std": 0.08387369066476821, "rewards/accuracy_reward": 0.5857421875, "rewards/brier_reward": 0.8214640617370605, "rewards/confidence_one_or_zero": 0.0037109375, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5544834017753602, "step": 250 }, { "epoch": 0.8, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 385.75, "eval_completions/max_terminated_length": 385.75, "eval_completions/mean_length": 245.00457763671875, "eval_completions/mean_terminated_length": 245.00457763671875, "eval_completions/min_length": 139.0, "eval_completions/min_terminated_length": 139.0, "eval_loss": 0.0, "eval_num_tokens": 853554964.0, "eval_reward": 1.12277153134346, "eval_reward_std": 0.28607048839330673, "eval_rewards/accuracy_reward": 0.421875, "eval_rewards/brier_reward": 0.8236669898033142, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.5200195461511612, "eval_runtime": 22.5943, "eval_samples_per_second": 22.129, "eval_steps_per_second": 0.177, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.2, "completions/max_terminated_length": 516.2, "completions/mean_length": 238.09716796875, "completions/mean_terminated_length": 238.09716796875, "completions/min_length": 119.6, "completions/min_terminated_length": 119.6, "epoch": 0.816, "grad_norm": 0.0013309334171935916, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 871092247.0, "reward": 1.1905385255813599, "reward_std": 0.09376283437013626, "rewards/accuracy_reward": 0.58505859375, "rewards/brier_reward": 0.7960173130035401, "rewards/confidence_one_or_zero": 0.0037109375, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5715683579444886, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.4, "completions/max_terminated_length": 518.4, "completions/mean_length": 245.5568359375, "completions/mean_terminated_length": 245.5568359375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.832, "grad_norm": 0.0011804981622844934, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 888615101.0, "reward": 1.1889414310455322, "reward_std": 0.08597695529460907, "rewards/accuracy_reward": 0.55439453125, "rewards/brier_reward": 0.8234872579574585, "rewards/confidence_one_or_zero": 0.00224609375, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5678603529930115, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.8, "completions/max_terminated_length": 555.8, "completions/mean_length": 252.11484375, "completions/mean_terminated_length": 252.11484375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.848, "grad_norm": 0.001554572256281972, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 906211125.0, "reward": 1.1674025774002075, "reward_std": 0.0894822582602501, "rewards/accuracy_reward": 0.52626953125, "rewards/brier_reward": 0.8085344791412353, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5744921803474426, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 795.2, "completions/max_terminated_length": 764.6, "completions/mean_length": 257.57587890625, "completions/mean_terminated_length": 257.45111694335935, "completions/min_length": 134.2, "completions/min_terminated_length": 134.2, "epoch": 0.864, "grad_norm": 0.002882245695218444, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 923835518.0, "reward": 1.2002546310424804, "reward_std": 0.08959609419107437, "rewards/accuracy_reward": 0.5888671875, "rewards/brier_reward": 0.8117385983467102, "rewards/confidence_one_or_zero": 0.00087890625, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.5740029335021972, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 777.2, "completions/max_terminated_length": 575.4, "completions/mean_length": 259.45830078125, "completions/mean_terminated_length": 259.33391723632815, "completions/min_length": 127.6, "completions/min_terminated_length": 127.6, "epoch": 0.88, "grad_norm": 0.001367030548863113, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 941639443.0, "reward": 1.1616749525070191, "reward_std": 0.08919112980365754, "rewards/accuracy_reward": 0.51142578125, "rewards/brier_reward": 0.8120206832885742, "rewards/confidence_one_or_zero": 0.00126953125, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.5385517477989197, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.6, "completions/max_terminated_length": 568.6, "completions/mean_length": 261.7333984375, "completions/mean_terminated_length": 261.7333984375, "completions/min_length": 129.6, "completions/min_terminated_length": 129.6, "epoch": 0.896, "grad_norm": 0.0019051535055041313, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 959430441.0, "reward": 1.1740495443344117, "reward_std": 0.08368157297372818, "rewards/accuracy_reward": 0.5408203125, "rewards/brier_reward": 0.8072776556015014, "rewards/confidence_one_or_zero": 0.00068359375, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5425273656845093, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 786.4, "completions/max_terminated_length": 605.6, "completions/mean_length": 262.71474609375, "completions/mean_terminated_length": 262.591015625, "completions/min_length": 127.6, "completions/min_terminated_length": 127.6, "epoch": 0.912, "grad_norm": 0.0032088656444102526, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 977171936.0, "reward": 1.1776597261428834, "reward_std": 0.0879664734005928, "rewards/accuracy_reward": 0.54130859375, "rewards/brier_reward": 0.8142051458358764, "rewards/confidence_one_or_zero": 0.00048828125, "rewards/format_reward": 0.9998046875, "rewards/mean_confidence_reward": 0.5229726552963256, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 255.93291015625, "completions/mean_terminated_length": 255.93291015625, "completions/min_length": 124.8, "completions/min_terminated_length": 124.8, "epoch": 0.928, "grad_norm": 0.0023952668998390436, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 994819505.0, "reward": 1.1611377239227294, "reward_std": 0.08492133468389511, "rewards/accuracy_reward": 0.5228515625, "rewards/brier_reward": 0.7994229435920716, "rewards/confidence_one_or_zero": 0.00078125, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5169863283634186, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 775.0, "completions/max_terminated_length": 598.2, "completions/mean_length": 257.284765625, "completions/mean_terminated_length": 257.160205078125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.944, "grad_norm": 0.001820826786570251, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 1012429525.0, "reward": 1.173994493484497, "reward_std": 0.08908755034208297, "rewards/accuracy_reward": 0.53642578125, "rewards/brier_reward": 0.8116598725318909, "rewards/confidence_one_or_zero": 0.000390625, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.5185712933540344, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.4, "completions/max_terminated_length": 602.4, "completions/mean_length": 259.2900390625, "completions/mean_terminated_length": 259.2900390625, "completions/min_length": 131.4, "completions/min_terminated_length": 131.4, "epoch": 0.96, "grad_norm": 0.004218410234898329, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 1030024975.0, "reward": 1.1701999187469483, "reward_std": 0.07609933465719224, "rewards/accuracy_reward": 0.5234375, "rewards/brier_reward": 0.8172542452812195, "rewards/confidence_one_or_zero": 0.0001953125, "rewards/format_reward": 0.99970703125, "rewards/mean_confidence_reward": 0.5500380873680115, "step": 300 }, { "epoch": 0.96, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 430.5, "eval_completions/max_terminated_length": 430.5, "eval_completions/mean_length": 257.0390625, "eval_completions/mean_terminated_length": 257.0390625, "eval_completions/min_length": 164.25, "eval_completions/min_terminated_length": 164.25, "eval_loss": 0.0, "eval_num_tokens": 1030024975.0, "eval_reward": 1.1227055788040161, "eval_reward_std": 0.2832227647304535, "eval_rewards/accuracy_reward": 0.439453125, "eval_rewards/brier_reward": 0.8059570342302322, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 1.0, "eval_rewards/mean_confidence_reward": 0.5294921696186066, "eval_runtime": 25.4806, "eval_samples_per_second": 19.623, "eval_steps_per_second": 0.157, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.2, "completions/max_terminated_length": 605.2, "completions/mean_length": 258.0880859375, "completions/mean_terminated_length": 258.0880859375, "completions/min_length": 130.4, "completions/min_terminated_length": 130.4, "epoch": 0.976, "grad_norm": 0.0010340906446799636, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 1047528917.0, "reward": 1.1855917692184448, "reward_std": 0.07872401475906372, "rewards/accuracy_reward": 0.5564453125, "rewards/brier_reward": 0.8147371768951416, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5425136804580688, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 9.765625e-05, "completions/max_length": 781.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 255.09814453125, "completions/mean_terminated_length": 254.97261962890624, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.992, "grad_norm": 0.016297942027449608, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 1065269602.0, "reward": 1.166081428527832, "reward_std": 0.07803938686847686, "rewards/accuracy_reward": 0.521484375, "rewards/brier_reward": 0.8107750654220581, "rewards/confidence_one_or_zero": 0.0001953125, "rewards/format_reward": 0.99990234375, "rewards/mean_confidence_reward": 0.5243340075016022, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 257.6454162597656, "completions/mean_terminated_length": 257.6454162597656, "completions/min_length": 129.5, "completions/min_terminated_length": 129.5, "epoch": 0.9984, "num_tokens": 1072324341.0, "reward": 1.1739696860313416, "reward_std": 0.08398981019854546, "rewards/accuracy_reward": 0.55712890625, "rewards/brier_reward": 0.7908094227313995, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 1.0, "rewards/mean_confidence_reward": 0.5394628942012787, "step": 312, "total_flos": 0.0, "train_loss": 0.004670050070182277, "train_runtime": 88432.5734, "train_samples_per_second": 0.226, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 312, "num_input_tokens_seen": 1072324341, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }