{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4992, "eval_steps": 50, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04873046875, "completions/max_length": 3832.0, "completions/max_terminated_length": 3832.0, "completions/mean_length": 417.21298828125, "completions/mean_terminated_length": 438.694482421875, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.010666666666666666, "grad_norm": 0.0022994126193225384, "learning_rate": 5.319148936170213e-07, "loss": -0.0106, "num_tokens": 7050533.0, "reward": 0.2718377888202667, "reward_std": 0.3972369432449341, "rewards/accuracy_reward": 0.0904296875, "rewards/brier_reward": 0.11152872890233993, "rewards/confidence_one_or_zero": 0.380078125, "rewards/format_reward": 0.34169921875, "rewards/mean_confidence_reward": 0.5165613055229187, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.047265625, "completions/max_length": 3725.8, "completions/max_terminated_length": 3725.8, "completions/mean_length": 396.103125, "completions/mean_terminated_length": 415.90752563476565, "completions/min_length": 0.0, "completions/min_terminated_length": 2.0, "epoch": 0.021333333333333333, "grad_norm": 0.002011506352573633, "learning_rate": 1.0638297872340427e-06, "loss": -0.0032, "num_tokens": 13869093.0, "reward": 0.36579355001449587, "reward_std": 0.4264163374900818, "rewards/accuracy_reward": 0.119921875, "rewards/brier_reward": 0.14758132696151732, "rewards/confidence_one_or_zero": 0.4490234375, "rewards/format_reward": 0.4640625, "rewards/mean_confidence_reward": 0.6210533976554871, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0349609375, "completions/max_length": 3780.8, "completions/max_terminated_length": 3780.8, "completions/mean_length": 342.1515625, "completions/mean_terminated_length": 354.5928588867188, "completions/min_length": 0.0, "completions/min_terminated_length": 8.8, "epoch": 0.032, "grad_norm": 0.0011837292695418, "learning_rate": 1.595744680851064e-06, "loss": -0.0015, "num_tokens": 20090005.0, "reward": 0.5768917322158813, "reward_std": 0.42254899740219115, "rewards/accuracy_reward": 0.17490234375, "rewards/brier_reward": 0.2182064473628998, "rewards/confidence_one_or_zero": 0.6333984375, "rewards/format_reward": 0.76064453125, "rewards/mean_confidence_reward": 0.8744643926620483, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03291015625, "completions/max_length": 3326.2, "completions/max_terminated_length": 3326.2, "completions/mean_length": 300.54345703125, "completions/mean_terminated_length": 310.7569152832031, "completions/min_length": 0.0, "completions/min_terminated_length": 36.6, "epoch": 0.042666666666666665, "grad_norm": 0.005445198621600866, "learning_rate": 2.1276595744680853e-06, "loss": -0.0134, "num_tokens": 25946610.0, "reward": 0.6536507606506348, "reward_std": 0.3477814972400665, "rewards/accuracy_reward": 0.167578125, "rewards/brier_reward": 0.2340293437242508, "rewards/confidence_one_or_zero": 0.5921875, "rewards/format_reward": 0.9056640625, "rewards/mean_confidence_reward": 0.906139588356018, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0265625, "completions/max_length": 3896.2, "completions/max_terminated_length": 3896.2, "completions/mean_length": 271.092578125, "completions/mean_terminated_length": 278.50535888671874, "completions/min_length": 0.0, "completions/min_terminated_length": 45.2, "epoch": 0.05333333333333334, "grad_norm": 0.0007234986987896264, "learning_rate": 2.6595744680851065e-06, "loss": -0.019, "num_tokens": 31458182.0, "reward": 0.7525559663772583, "reward_std": 0.3314167380332947, "rewards/accuracy_reward": 0.2111328125, "rewards/brier_reward": 0.3317470133304596, "rewards/confidence_one_or_zero": 0.376171875, "rewards/format_reward": 0.96220703125, "rewards/mean_confidence_reward": 0.8775923013687134, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.019921875, "completions/max_length": 3383.8, "completions/max_terminated_length": 3383.8, "completions/mean_length": 309.6873046875, "completions/mean_terminated_length": 315.9191589355469, "completions/min_length": 0.0, "completions/min_terminated_length": 48.2, "epoch": 0.064, "grad_norm": 0.0004548918514046818, "learning_rate": 3.191489361702128e-06, "loss": -0.0139, "num_tokens": 37443556.0, "reward": 0.8603755593299866, "reward_std": 0.3160943269729614, "rewards/accuracy_reward": 0.25341796875, "rewards/brier_reward": 0.4920231521129608, "rewards/confidence_one_or_zero": 0.08056640625, "rewards/format_reward": 0.97529296875, "rewards/mean_confidence_reward": 0.7704261898994446, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0197265625, "completions/max_length": 3442.8, "completions/max_terminated_length": 3442.8, "completions/mean_length": 341.35791015625, "completions/mean_terminated_length": 348.3955383300781, "completions/min_length": 0.0, "completions/min_terminated_length": 87.8, "epoch": 0.07466666666666667, "grad_norm": 0.00027860456611961126, "learning_rate": 3.723404255319149e-06, "loss": -0.0176, "num_tokens": 43686581.0, "reward": 0.9539328455924988, "reward_std": 0.25055329501628876, "rewards/accuracy_reward": 0.2630859375, "rewards/brier_reward": 0.6670343160629273, "rewards/confidence_one_or_zero": 0.0181640625, "rewards/format_reward": 0.977734375, "rewards/mean_confidence_reward": 0.5359978020191193, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02119140625, "completions/max_length": 3646.2, "completions/max_terminated_length": 3646.2, "completions/mean_length": 370.0533203125, "completions/mean_terminated_length": 378.1131530761719, "completions/min_length": 0.0, "completions/min_terminated_length": 87.8, "epoch": 0.08533333333333333, "grad_norm": 0.00019421246543060988, "learning_rate": 4.255319148936171e-06, "loss": -0.0236, "num_tokens": 50196599.0, "reward": 1.005708146095276, "reward_std": 0.19348926544189454, "rewards/accuracy_reward": 0.30771484375, "rewards/brier_reward": 0.7264483809471131, "rewards/confidence_one_or_zero": 0.02001953125, "rewards/format_reward": 0.97724609375, "rewards/mean_confidence_reward": 0.32700340151786805, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0263671875, "completions/max_length": 3611.4, "completions/max_terminated_length": 3611.4, "completions/mean_length": 419.93203125, "completions/mean_terminated_length": 431.3776428222656, "completions/min_length": 0.0, "completions/min_terminated_length": 106.8, "epoch": 0.096, "grad_norm": 0.00017933457274921238, "learning_rate": 4.787234042553192e-06, "loss": -0.0264, "num_tokens": 57285631.0, "reward": 1.0260677814483643, "reward_std": 0.17603319883346558, "rewards/accuracy_reward": 0.37861328125, "rewards/brier_reward": 0.701446795463562, "rewards/confidence_one_or_zero": 0.02587890625, "rewards/format_reward": 0.9720703125, "rewards/mean_confidence_reward": 0.23751758635044098, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.031640625, "completions/max_length": 3522.6, "completions/max_terminated_length": 3522.6, "completions/mean_length": 460.63408203125, "completions/mean_terminated_length": 475.79522094726565, "completions/min_length": 0.0, "completions/min_terminated_length": 116.2, "epoch": 0.10666666666666667, "grad_norm": 0.00020549257169477642, "learning_rate": 4.919786096256685e-06, "loss": -0.0343, "num_tokens": 64789148.0, "reward": 1.0377721786499023, "reward_std": 0.19848274290561677, "rewards/accuracy_reward": 0.403125, "rewards/brier_reward": 0.7054211854934692, "rewards/confidence_one_or_zero": 0.016796875, "rewards/format_reward": 0.9669921875, "rewards/mean_confidence_reward": 0.28423587083816526, "step": 50 }, { "epoch": 0.10666666666666667, "eval_completions/clipped_ratio": 0.033203125, "eval_completions/max_length": 1511.6875, "eval_completions/max_terminated_length": 1511.6875, "eval_completions/mean_length": 481.26543045043945, "eval_completions/mean_terminated_length": 497.91385078430176, "eval_completions/min_length": 30.5, "eval_completions/min_terminated_length": 152.3125, "eval_loss": 0.0, "eval_num_tokens": 64789148.0, "eval_reward": 1.0428522787988186, "eval_reward_std": 0.2570919021964073, "eval_rewards/accuracy_reward": 0.41015625, "eval_rewards/brier_reward": 0.7097212113440037, "eval_rewards/confidence_one_or_zero": 0.021484375, "eval_rewards/format_reward": 0.9658203125, "eval_rewards/mean_confidence_reward": 0.31723164208233356, "eval_runtime": 330.2735, "eval_samples_per_second": 3.028, "eval_steps_per_second": 0.048, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 3745.6, "completions/max_terminated_length": 3745.6, "completions/mean_length": 499.6552734375, "completions/mean_terminated_length": 518.9554016113282, "completions/min_length": 0.0, "completions/min_terminated_length": 102.6, "epoch": 0.11733333333333333, "grad_norm": 0.00023679890728089958, "learning_rate": 4.786096256684493e-06, "loss": -0.0424, "num_tokens": 72641586.0, "reward": 1.0429035425186157, "reward_std": 0.2101288139820099, "rewards/accuracy_reward": 0.41943359375, "rewards/brier_reward": 0.7049405694007873, "rewards/confidence_one_or_zero": 0.01376953125, "rewards/format_reward": 0.96142578125, "rewards/mean_confidence_reward": 0.34695433378219603, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03681640625, "completions/max_length": 3240.2, "completions/max_terminated_length": 3240.2, "completions/mean_length": 516.13466796875, "completions/mean_terminated_length": 535.995068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 116.2, "epoch": 0.128, "grad_norm": 0.00020350873819552362, "learning_rate": 4.6524064171123e-06, "loss": -0.0377, "num_tokens": 80683029.0, "reward": 1.0565701007843018, "reward_std": 0.21977659463882446, "rewards/accuracy_reward": 0.4330078125, "rewards/brier_reward": 0.718209958076477, "rewards/confidence_one_or_zero": 0.01044921875, "rewards/format_reward": 0.9619140625, "rewards/mean_confidence_reward": 0.40334631204605104, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03916015625, "completions/max_length": 3614.2, "completions/max_terminated_length": 3614.2, "completions/mean_length": 548.8623046875, "completions/mean_terminated_length": 571.418896484375, "completions/min_length": 0.0, "completions/min_terminated_length": 146.8, "epoch": 0.13866666666666666, "grad_norm": 0.00023808285186532885, "learning_rate": 4.518716577540107e-06, "loss": -0.0388, "num_tokens": 89039955.0, "reward": 1.0643263101577758, "reward_std": 0.22522514760494233, "rewards/accuracy_reward": 0.45087890625, "rewards/brier_reward": 0.718096923828125, "rewards/confidence_one_or_zero": 0.00546875, "rewards/format_reward": 0.95966796875, "rewards/mean_confidence_reward": 0.4328162491321564, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04814453125, "completions/max_length": 3548.4, "completions/max_terminated_length": 3548.4, "completions/mean_length": 565.52802734375, "completions/mean_terminated_length": 594.6517456054687, "completions/min_length": 0.0, "completions/min_terminated_length": 121.8, "epoch": 0.14933333333333335, "grad_norm": 0.003085868200287223, "learning_rate": 4.385026737967915e-06, "loss": -0.046, "num_tokens": 97599986.0, "reward": 1.06097412109375, "reward_std": 0.23501123189926149, "rewards/accuracy_reward": 0.46328125, "rewards/brier_reward": 0.7087554454803466, "rewards/confidence_one_or_zero": 0.002734375, "rewards/format_reward": 0.94990234375, "rewards/mean_confidence_reward": 0.4535819053649902, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03466796875, "completions/max_length": 3784.8, "completions/max_terminated_length": 3784.8, "completions/mean_length": 603.82158203125, "completions/mean_terminated_length": 625.44541015625, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.16, "grad_norm": 0.0002215144777437672, "learning_rate": 4.251336898395722e-06, "loss": -0.0343, "num_tokens": 106515951.0, "reward": 1.0931342601776124, "reward_std": 0.21771803498268127, "rewards/accuracy_reward": 0.50087890625, "rewards/brier_reward": 0.7211225032806396, "rewards/confidence_one_or_zero": 0.00166015625, "rewards/format_reward": 0.9642578125, "rewards/mean_confidence_reward": 0.46166038513183594, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03857421875, "completions/max_length": 3760.2, "completions/max_terminated_length": 3760.2, "completions/mean_length": 598.0275390625, "completions/mean_terminated_length": 622.232177734375, "completions/min_length": 0.0, "completions/min_terminated_length": 158.4, "epoch": 0.17066666666666666, "grad_norm": 0.0002537222462706268, "learning_rate": 4.11764705882353e-06, "loss": -0.0398, "num_tokens": 115384169.0, "reward": 1.0766690015792846, "reward_std": 0.23046172559261321, "rewards/accuracy_reward": 0.47578125, "rewards/brier_reward": 0.7174887418746948, "rewards/confidence_one_or_zero": 0.0009765625, "rewards/format_reward": 0.96005859375, "rewards/mean_confidence_reward": 0.46985026001930236, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03154296875, "completions/max_length": 3239.2, "completions/max_terminated_length": 3239.2, "completions/mean_length": 640.25078125, "completions/mean_terminated_length": 661.4207153320312, "completions/min_length": 0.0, "completions/min_terminated_length": 168.6, "epoch": 0.18133333333333335, "grad_norm": 0.0002039925311692059, "learning_rate": 3.983957219251337e-06, "loss": -0.0322, "num_tokens": 124663409.0, "reward": 1.111165952682495, "reward_std": 0.21703881323337554, "rewards/accuracy_reward": 0.53037109375, "rewards/brier_reward": 0.7248614072799683, "rewards/confidence_one_or_zero": 0.00087890625, "rewards/format_reward": 0.96708984375, "rewards/mean_confidence_reward": 0.47759194374084474, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0322265625, "completions/max_length": 3809.0, "completions/max_terminated_length": 3809.0, "completions/mean_length": 674.80703125, "completions/mean_terminated_length": 697.274072265625, "completions/min_length": 0.0, "completions/min_terminated_length": 174.8, "epoch": 0.192, "grad_norm": 0.0002259796456201002, "learning_rate": 3.850267379679145e-06, "loss": -0.0323, "num_tokens": 134289081.0, "reward": 1.1023191452026366, "reward_std": 0.21436747312545776, "rewards/accuracy_reward": 0.5142578125, "rewards/brier_reward": 0.7240622282028198, "rewards/confidence_one_or_zero": 0.0001953125, "rewards/format_reward": 0.96630859375, "rewards/mean_confidence_reward": 0.4761265993118286, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.034375, "completions/max_length": 3722.0, "completions/max_terminated_length": 3722.0, "completions/mean_length": 695.24814453125, "completions/mean_terminated_length": 720.0158325195313, "completions/min_length": 0.0, "completions/min_terminated_length": 201.2, "epoch": 0.20266666666666666, "grad_norm": 0.0002535897074267268, "learning_rate": 3.716577540106952e-06, "loss": -0.0348, "num_tokens": 144155206.0, "reward": 1.1052819728851317, "reward_std": 0.22034441232681273, "rewards/accuracy_reward": 0.52578125, "rewards/brier_reward": 0.721687114238739, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.9630859375, "rewards/mean_confidence_reward": 0.4807871162891388, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03681640625, "completions/max_length": 3776.0, "completions/max_terminated_length": 3776.0, "completions/mean_length": 714.55859375, "completions/mean_terminated_length": 742.0417602539062, "completions/min_length": 0.0, "completions/min_terminated_length": 184.0, "epoch": 0.21333333333333335, "grad_norm": 0.00020959587709512562, "learning_rate": 3.5828877005347597e-06, "loss": -0.0367, "num_tokens": 154205790.0, "reward": 1.1090866804122925, "reward_std": 0.21838183403015138, "rewards/accuracy_reward": 0.53486328125, "rewards/brier_reward": 0.7215817093849182, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.96171875, "rewards/mean_confidence_reward": 0.48115434050559996, "step": 100 }, { "epoch": 0.21333333333333335, "eval_completions/clipped_ratio": 0.0322265625, "eval_completions/max_length": 2135.5, "eval_completions/max_terminated_length": 2135.5, "eval_completions/mean_length": 729.1765632629395, "eval_completions/mean_terminated_length": 753.6238555908203, "eval_completions/min_length": 32.8125, "eval_completions/min_terminated_length": 265.25, "eval_loss": 0.0, "eval_num_tokens": 154205790.0, "eval_reward": 1.106356181204319, "eval_reward_std": 0.3143840888515115, "eval_rewards/accuracy_reward": 0.5185546875, "eval_rewards/brier_reward": 0.7273510619997978, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.966796875, "eval_rewards/mean_confidence_reward": 0.4822753965854645, "eval_runtime": 345.9345, "eval_samples_per_second": 2.891, "eval_steps_per_second": 0.046, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02333984375, "completions/max_length": 3680.4, "completions/max_terminated_length": 3680.4, "completions/mean_length": 740.38525390625, "completions/mean_terminated_length": 758.0415649414062, "completions/min_length": 0.0, "completions/min_terminated_length": 166.2, "epoch": 0.224, "grad_norm": 0.000193859072169289, "learning_rate": 3.449197860962567e-06, "loss": -0.0224, "num_tokens": 164522343.0, "reward": 1.1161011457443237, "reward_std": 0.1981082409620285, "rewards/accuracy_reward": 0.52265625, "rewards/brier_reward": 0.7332667350769043, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.97626953125, "rewards/mean_confidence_reward": 0.4871376514434814, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0263671875, "completions/max_length": 3624.6, "completions/max_terminated_length": 3624.6, "completions/mean_length": 750.2166015625, "completions/mean_terminated_length": 770.4773681640625, "completions/min_length": 0.0, "completions/min_terminated_length": 206.0, "epoch": 0.23466666666666666, "grad_norm": 0.00020454383047763258, "learning_rate": 3.3155080213903747e-06, "loss": -0.022, "num_tokens": 174955409.0, "reward": 1.1203881740570067, "reward_std": 0.19748010039329528, "rewards/accuracy_reward": 0.54072265625, "rewards/brier_reward": 0.7278758764266968, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.97216796875, "rewards/mean_confidence_reward": 0.48651269674301145, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03046875, "completions/max_length": 3376.2, "completions/max_terminated_length": 3376.2, "completions/mean_length": 749.41201171875, "completions/mean_terminated_length": 772.8901977539062, "completions/min_length": 0.0, "completions/min_terminated_length": 196.6, "epoch": 0.24533333333333332, "grad_norm": 0.0002078731486108154, "learning_rate": 3.181818181818182e-06, "loss": -0.0276, "num_tokens": 185371564.0, "reward": 1.11997492313385, "reward_std": 0.20475648939609528, "rewards/accuracy_reward": 0.544140625, "rewards/brier_reward": 0.7269518375396729, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.96884765625, "rewards/mean_confidence_reward": 0.4832145094871521, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 3540.6, "completions/max_terminated_length": 3540.6, "completions/mean_length": 756.285546875, "completions/mean_terminated_length": 782.4071899414063, "completions/min_length": 0.0, "completions/min_terminated_length": 199.2, "epoch": 0.256, "grad_norm": 0.00019684557628352195, "learning_rate": 3.0481283422459896e-06, "loss": -0.0299, "num_tokens": 195867160.0, "reward": 1.1076245546340941, "reward_std": 0.20969551503658296, "rewards/accuracy_reward": 0.5251953125, "rewards/brier_reward": 0.7241261005401611, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.96591796875, "rewards/mean_confidence_reward": 0.48320378065109254, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0248046875, "completions/max_length": 3629.2, "completions/max_terminated_length": 3629.2, "completions/mean_length": 731.28798828125, "completions/mean_terminated_length": 749.828466796875, "completions/min_length": 0.0, "completions/min_terminated_length": 171.6, "epoch": 0.26666666666666666, "grad_norm": 0.00023284385679289699, "learning_rate": 2.914438502673797e-06, "loss": -0.024, "num_tokens": 206106365.0, "reward": 1.1237028598785401, "reward_std": 0.20264139473438264, "rewards/accuracy_reward": 0.542578125, "rewards/brier_reward": 0.7308919787406921, "rewards/confidence_one_or_zero": 0.00087890625, "rewards/format_reward": 0.97392578125, "rewards/mean_confidence_reward": 0.4858167290687561, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02783203125, "completions/max_length": 3273.8, "completions/max_terminated_length": 3273.8, "completions/mean_length": 706.0017578125, "completions/mean_terminated_length": 726.1967407226563, "completions/min_length": 0.0, "completions/min_terminated_length": 196.6, "epoch": 0.2773333333333333, "grad_norm": 0.0002099503471981734, "learning_rate": 2.7807486631016045e-06, "loss": -0.0268, "num_tokens": 216081839.0, "reward": 1.1141971349716187, "reward_std": 0.1988514244556427, "rewards/accuracy_reward": 0.52919921875, "rewards/brier_reward": 0.7281892657279968, "rewards/confidence_one_or_zero": 0.0001953125, "rewards/format_reward": 0.97099609375, "rewards/mean_confidence_reward": 0.4858289957046509, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02109375, "completions/max_length": 3532.0, "completions/max_terminated_length": 3532.0, "completions/mean_length": 696.258984375, "completions/mean_terminated_length": 711.3540283203125, "completions/min_length": 0.0, "completions/min_terminated_length": 211.4, "epoch": 0.288, "grad_norm": 0.00021950564405415207, "learning_rate": 2.647058823529412e-06, "loss": -0.0214, "num_tokens": 225948139.0, "reward": 1.136391854286194, "reward_std": 0.18573434352874757, "rewards/accuracy_reward": 0.56083984375, "rewards/brier_reward": 0.7337113857269287, "rewards/confidence_one_or_zero": 0.0001953125, "rewards/format_reward": 0.97822265625, "rewards/mean_confidence_reward": 0.4880725979804993, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01826171875, "completions/max_length": 3008.6, "completions/max_terminated_length": 3008.6, "completions/mean_length": 709.566796875, "completions/mean_terminated_length": 722.8854858398438, "completions/min_length": 0.0, "completions/min_terminated_length": 180.8, "epoch": 0.2986666666666667, "grad_norm": 0.0001989748125197366, "learning_rate": 2.5133689839572194e-06, "loss": -0.0177, "num_tokens": 235999223.0, "reward": 1.139591360092163, "reward_std": 0.17391741871833802, "rewards/accuracy_reward": 0.56337890625, "rewards/brier_reward": 0.7346415877342224, "rewards/confidence_one_or_zero": 0.0001953125, "rewards/format_reward": 0.98115234375, "rewards/mean_confidence_reward": 0.49123767018318176, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02548828125, "completions/max_length": 3968.8, "completions/max_terminated_length": 3968.8, "completions/mean_length": 752.06943359375, "completions/mean_terminated_length": 771.8078369140625, "completions/min_length": 0.0, "completions/min_terminated_length": 199.0, "epoch": 0.30933333333333335, "grad_norm": 0.000300207786494866, "learning_rate": 2.379679144385027e-06, "loss": -0.0249, "num_tokens": 246474910.0, "reward": 1.1206291913986206, "reward_std": 0.19275815784931183, "rewards/accuracy_reward": 0.53828125, "rewards/brier_reward": 0.7297251224517822, "rewards/confidence_one_or_zero": 0.00126953125, "rewards/format_reward": 0.9732421875, "rewards/mean_confidence_reward": 0.48650553822517395, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02158203125, "completions/max_length": 3529.4, "completions/max_terminated_length": 3529.4, "completions/mean_length": 745.94892578125, "completions/mean_terminated_length": 762.421044921875, "completions/min_length": 0.0, "completions/min_terminated_length": 178.4, "epoch": 0.32, "grad_norm": 0.00023590361524838954, "learning_rate": 2.2459893048128343e-06, "loss": -0.0204, "num_tokens": 256861427.0, "reward": 1.1211992502212524, "reward_std": 0.19113921225070954, "rewards/accuracy_reward": 0.5330078125, "rewards/brier_reward": 0.7318416714668274, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.9775390625, "rewards/mean_confidence_reward": 0.4905679523944855, "step": 150 }, { "epoch": 0.32, "eval_completions/clipped_ratio": 0.015625, "eval_completions/max_length": 1815.6875, "eval_completions/max_terminated_length": 1815.6875, "eval_completions/mean_length": 742.1876945495605, "eval_completions/mean_terminated_length": 754.273509979248, "eval_completions/min_length": 137.6875, "eval_completions/min_terminated_length": 270.0, "eval_loss": 0.0, "eval_num_tokens": 256861427.0, "eval_reward": 1.1376136094331741, "eval_reward_std": 0.2811641450971365, "eval_rewards/accuracy_reward": 0.552734375, "eval_rewards/brier_reward": 0.7390844747424126, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9833984375, "eval_rewards/mean_confidence_reward": 0.49321289360523224, "eval_runtime": 284.0103, "eval_samples_per_second": 3.521, "eval_steps_per_second": 0.056, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01787109375, "completions/max_length": 3623.0, "completions/max_terminated_length": 3623.0, "completions/mean_length": 724.40556640625, "completions/mean_terminated_length": 737.5683471679688, "completions/min_length": 0.0, "completions/min_terminated_length": 180.8, "epoch": 0.33066666666666666, "grad_norm": 0.0002188891521655023, "learning_rate": 2.112299465240642e-06, "loss": -0.0164, "num_tokens": 267030892.0, "reward": 1.133889079093933, "reward_std": 0.17375424206256868, "rewards/accuracy_reward": 0.55048828125, "rewards/brier_reward": 0.7355417370796203, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98173828125, "rewards/mean_confidence_reward": 0.4923380553722382, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01875, "completions/max_length": 3686.2, "completions/max_terminated_length": 3686.2, "completions/mean_length": 724.2431640625, "completions/mean_terminated_length": 738.1012939453125, "completions/min_length": 0.0, "completions/min_terminated_length": 207.6, "epoch": 0.3413333333333333, "grad_norm": 0.00019207321747671813, "learning_rate": 1.9786096256684497e-06, "loss": -0.0167, "num_tokens": 277180582.0, "reward": 1.1334651947021483, "reward_std": 0.1689860612154007, "rewards/accuracy_reward": 0.55, "rewards/brier_reward": 0.7361587643623352, "rewards/confidence_one_or_zero": 0.00078125, "rewards/format_reward": 0.98076171875, "rewards/mean_confidence_reward": 0.4910940706729889, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01513671875, "completions/max_length": 3753.4, "completions/max_terminated_length": 3753.4, "completions/mean_length": 762.5583984375, "completions/mean_terminated_length": 774.3962524414062, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.352, "grad_norm": 0.00017471906903665513, "learning_rate": 1.8449197860962567e-06, "loss": -0.014, "num_tokens": 287740188.0, "reward": 1.1528494119644166, "reward_std": 0.17534538209438325, "rewards/accuracy_reward": 0.583984375, "rewards/brier_reward": 0.7377201795578003, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.983984375, "rewards/mean_confidence_reward": 0.4953427791595459, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02158203125, "completions/max_length": 3530.6, "completions/max_terminated_length": 3530.6, "completions/mean_length": 787.61787109375, "completions/mean_terminated_length": 804.80068359375, "completions/min_length": 0.0, "completions/min_terminated_length": 206.8, "epoch": 0.3626666666666667, "grad_norm": 0.0002157771377824247, "learning_rate": 1.7112299465240642e-06, "loss": -0.0207, "num_tokens": 298560467.0, "reward": 1.1448474168777465, "reward_std": 0.1869949847459793, "rewards/accuracy_reward": 0.5779296875, "rewards/brier_reward": 0.7339232444763184, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.97783203125, "rewards/mean_confidence_reward": 0.4906333088874817, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01767578125, "completions/max_length": 3371.4, "completions/max_terminated_length": 3371.4, "completions/mean_length": 764.63466796875, "completions/mean_terminated_length": 778.4362426757813, "completions/min_length": 0.0, "completions/min_terminated_length": 211.6, "epoch": 0.37333333333333335, "grad_norm": 0.00021693432063329965, "learning_rate": 1.5775401069518716e-06, "loss": -0.0178, "num_tokens": 309152310.0, "reward": 1.1474551439285279, "reward_std": 0.18755724132061005, "rewards/accuracy_reward": 0.57763671875, "rewards/brier_reward": 0.7359159588813782, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.98134765625, "rewards/mean_confidence_reward": 0.49203906655311586, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02353515625, "completions/max_length": 3407.4, "completions/max_terminated_length": 3407.4, "completions/mean_length": 772.02041015625, "completions/mean_terminated_length": 790.5445190429688, "completions/min_length": 0.0, "completions/min_terminated_length": 224.2, "epoch": 0.384, "grad_norm": 0.0001946146658156067, "learning_rate": 1.4438502673796793e-06, "loss": -0.0226, "num_tokens": 319806791.0, "reward": 1.1229839086532594, "reward_std": 0.17491495311260224, "rewards/accuracy_reward": 0.5380859375, "rewards/brier_reward": 0.7317001938819885, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.976171875, "rewards/mean_confidence_reward": 0.4897919952869415, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0193359375, "completions/max_length": 3332.2, "completions/max_terminated_length": 3332.2, "completions/mean_length": 774.34521484375, "completions/mean_terminated_length": 789.7105224609375, "completions/min_length": 0.0, "completions/min_terminated_length": 216.6, "epoch": 0.39466666666666667, "grad_norm": 0.00021305667178239673, "learning_rate": 1.3101604278074868e-06, "loss": -0.0173, "num_tokens": 330535798.0, "reward": 1.1333304166793823, "reward_std": 0.17513595819473265, "rewards/accuracy_reward": 0.55146484375, "rewards/brier_reward": 0.7354985475540161, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.9796875, "rewards/mean_confidence_reward": 0.4905791044235229, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02119140625, "completions/max_length": 3443.2, "completions/max_terminated_length": 3443.2, "completions/mean_length": 769.52978515625, "completions/mean_terminated_length": 786.377783203125, "completions/min_length": 0.0, "completions/min_terminated_length": 223.0, "epoch": 0.4053333333333333, "grad_norm": 0.00022391976381186396, "learning_rate": 1.1764705882352942e-06, "loss": -0.0204, "num_tokens": 341155271.0, "reward": 1.139868140220642, "reward_std": 0.19301997423171996, "rewards/accuracy_reward": 0.56884765625, "rewards/brier_reward": 0.7333398222923279, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9775390625, "rewards/mean_confidence_reward": 0.4906933605670929, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189453125, "completions/max_length": 3776.8, "completions/max_terminated_length": 3776.8, "completions/mean_length": 771.45654296875, "completions/mean_terminated_length": 786.5269165039062, "completions/min_length": 0.0, "completions/min_terminated_length": 209.4, "epoch": 0.416, "grad_norm": 0.000211334801861085, "learning_rate": 1.0427807486631017e-06, "loss": -0.0174, "num_tokens": 351822186.0, "reward": 1.1492414712905883, "reward_std": 0.17223585546016693, "rewards/accuracy_reward": 0.5830078125, "rewards/brier_reward": 0.7351918578147888, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.9802734375, "rewards/mean_confidence_reward": 0.48971192836761473, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02529296875, "completions/max_length": 4013.0, "completions/max_terminated_length": 4013.0, "completions/mean_length": 776.52587890625, "completions/mean_terminated_length": 796.702392578125, "completions/min_length": 0.0, "completions/min_terminated_length": 232.0, "epoch": 0.4266666666666667, "grad_norm": 0.00024326991115231067, "learning_rate": 9.090909090909091e-07, "loss": -0.024, "num_tokens": 362520019.0, "reward": 1.136240339279175, "reward_std": 0.19438377916812896, "rewards/accuracy_reward": 0.56904296875, "rewards/brier_reward": 0.7294045448303222, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.9740234375, "rewards/mean_confidence_reward": 0.4875830113887787, "step": 200 }, { "epoch": 0.4266666666666667, "eval_completions/clipped_ratio": 0.0205078125, "eval_completions/max_length": 1878.5, "eval_completions/max_terminated_length": 1878.5, "eval_completions/mean_length": 764.9681663513184, "eval_completions/mean_terminated_length": 780.9868965148926, "eval_completions/min_length": 96.25, "eval_completions/min_terminated_length": 297.125, "eval_loss": 0.0, "eval_num_tokens": 362520019.0, "eval_reward": 1.1360864490270615, "eval_reward_std": 0.2948375102132559, "eval_rewards/accuracy_reward": 0.560546875, "eval_rewards/brier_reward": 0.7340771444141865, "eval_rewards/confidence_one_or_zero": 0.0, "eval_rewards/format_reward": 0.9775390625, "eval_rewards/mean_confidence_reward": 0.4896484352648258, "eval_runtime": 323.2058, "eval_samples_per_second": 3.094, "eval_steps_per_second": 0.05, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02255859375, "completions/max_length": 3584.8, "completions/max_terminated_length": 3584.8, "completions/mean_length": 765.71416015625, "completions/mean_terminated_length": 783.431103515625, "completions/min_length": 0.0, "completions/min_terminated_length": 215.8, "epoch": 0.43733333333333335, "grad_norm": 0.00024016403767745942, "learning_rate": 7.754010695187167e-07, "loss": -0.0218, "num_tokens": 373108388.0, "reward": 1.1395012855529785, "reward_std": 0.18607062399387359, "rewards/accuracy_reward": 0.56943359375, "rewards/brier_reward": 0.732899010181427, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.97666015625, "rewards/mean_confidence_reward": 0.48821924328804017, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02265625, "completions/max_length": 3176.4, "completions/max_terminated_length": 3176.4, "completions/mean_length": 786.155078125, "completions/mean_terminated_length": 804.3975830078125, "completions/min_length": 0.0, "completions/min_terminated_length": 215.4, "epoch": 0.448, "grad_norm": 0.00021111531532369554, "learning_rate": 6.417112299465242e-07, "loss": -0.0192, "num_tokens": 383926040.0, "reward": 1.1250059604644775, "reward_std": 0.19036045372486116, "rewards/accuracy_reward": 0.54189453125, "rewards/brier_reward": 0.7315450549125672, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9765625, "rewards/mean_confidence_reward": 0.4895175814628601, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.026171875, "completions/max_length": 3783.6, "completions/max_terminated_length": 3783.6, "completions/mean_length": 765.16962890625, "completions/mean_terminated_length": 785.7530517578125, "completions/min_length": 0.0, "completions/min_terminated_length": 227.4, "epoch": 0.45866666666666667, "grad_norm": 0.00020393490558490157, "learning_rate": 5.080213903743316e-07, "loss": -0.0257, "num_tokens": 394495009.0, "reward": 1.1425009727478028, "reward_std": 0.19886362850666045, "rewards/accuracy_reward": 0.58173828125, "rewards/brier_reward": 0.7300117135047912, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9732421875, "rewards/mean_confidence_reward": 0.48793848156929015, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0228515625, "completions/max_length": 3656.6, "completions/max_terminated_length": 3656.6, "completions/mean_length": 780.3501953125, "completions/mean_terminated_length": 798.559521484375, "completions/min_length": 0.0, "completions/min_terminated_length": 234.6, "epoch": 0.4693333333333333, "grad_norm": 0.00024207662499975413, "learning_rate": 3.7433155080213904e-07, "loss": -0.0225, "num_tokens": 405219299.0, "reward": 1.1340527296066285, "reward_std": 0.1835294783115387, "rewards/accuracy_reward": 0.559765625, "rewards/brier_reward": 0.731767475605011, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9765625, "rewards/mean_confidence_reward": 0.4893447160720825, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0205078125, "completions/max_length": 3377.4, "completions/max_terminated_length": 3377.4, "completions/mean_length": 782.55126953125, "completions/mean_terminated_length": 798.9751098632812, "completions/min_length": 0.0, "completions/min_terminated_length": 226.4, "epoch": 0.48, "grad_norm": 0.00020044122356921434, "learning_rate": 2.4064171122994655e-07, "loss": -0.021, "num_tokens": 415955856.0, "reward": 1.140486478805542, "reward_std": 0.1799767643213272, "rewards/accuracy_reward": 0.566796875, "rewards/brier_reward": 0.7353576302528382, "rewards/confidence_one_or_zero": 0.0001953125, "rewards/format_reward": 0.97880859375, "rewards/mean_confidence_reward": 0.490503066778183, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02333984375, "completions/max_length": 3544.2, "completions/max_terminated_length": 3544.2, "completions/mean_length": 787.57099609375, "completions/mean_terminated_length": 806.4834838867188, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.49066666666666664, "grad_norm": 0.00020212125673424453, "learning_rate": 1.0695187165775401e-07, "loss": -0.0235, "num_tokens": 426747335.0, "reward": 1.1553166151046752, "reward_std": 0.19148518443107604, "rewards/accuracy_reward": 0.60361328125, "rewards/brier_reward": 0.7308382272720337, "rewards/confidence_one_or_zero": 9.765625e-05, "rewards/format_reward": 0.976171875, "rewards/mean_confidence_reward": 0.48845637440681455, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.018798828125, "completions/max_length": 3411.5, "completions/max_terminated_length": 3411.5, "completions/mean_length": 780.903076171875, "completions/mean_terminated_length": 796.0233154296875, "completions/min_length": 0.0, "completions/min_terminated_length": 200.5, "epoch": 0.4992, "num_tokens": 435349741.0, "reward": 1.1434746384620667, "reward_std": 0.18124820291996002, "rewards/accuracy_reward": 0.5712890625, "rewards/brier_reward": 0.7353035807609558, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9803466796875, "rewards/mean_confidence_reward": 0.49278198927640915, "step": 234, "total_flos": 0.0, "train_loss": -0.01013354094237344, "train_runtime": 38659.6234, "train_samples_per_second": 0.388, "train_steps_per_second": 0.006 } ], "logging_steps": 5, "max_steps": 234, "num_input_tokens_seen": 435349741, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }