{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23386342376052385, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3750000298023224, "completions/max_length": 2048.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 1100.21435546875, "completions/mean_terminated_length": 531.5428466796875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.7699412405490875, "epoch": 0.0009354536950420954, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.355812668800354, "kl": 0.0, "learning_rate": 1e-05, "loss": 0.1222, "num_tokens": 202696.0, "reward": 0.535714328289032, "reward_std": 0.500964343547821, "rewards/math_reward/mean": 0.5357142686843872, "rewards/math_reward/std": 0.5009642839431763, "sampling/importance_sampling_ratio/max": 2.9837446212768555, "sampling/importance_sampling_ratio/mean": 0.7439863085746765, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4839599132537842, "sampling/sampling_logp_difference/mean": 0.01401771605014801, "step": 1, "step_time": 51.70103603997268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 791.8750610351562, "completions/mean_terminated_length": 449.29547119140625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.7248611152172089, "epoch": 0.0018709073900841909, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.8265047073364258, "kl": 0.0008526578167220578, "learning_rate": 9.99064546304958e-06, "loss": 0.1208, "num_tokens": 376978.0, "reward": 0.6517857313156128, "reward_std": 0.47854551672935486, "rewards/math_reward/mean": 0.6517857313156128, "rewards/math_reward/std": 0.47854548692703247, "sampling/importance_sampling_ratio/max": 2.9564757347106934, "sampling/importance_sampling_ratio/mean": 0.7252377271652222, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7673549652099609, "sampling/sampling_logp_difference/mean": 0.013798754662275314, "step": 2, "step_time": 32.49119739304297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 721.4732666015625, "completions/mean_terminated_length": 397.21112060546875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5675854757428169, "epoch": 0.002806361085126286, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3531298041343689, "kl": 0.0018229869601782411, "learning_rate": 9.981290926099158e-06, "loss": -0.0157, "num_tokens": 537471.0, "reward": 0.6339285969734192, "reward_std": 0.4838944375514984, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.8797943592071533, "sampling/importance_sampling_ratio/mean": 0.8044324517250061, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4811556339263916, "sampling/sampling_logp_difference/mean": 0.013060134835541248, "step": 3, "step_time": 29.402478941949084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 705.8750610351562, "completions/mean_terminated_length": 431.67742919921875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.7134235203266144, "epoch": 0.0037418147801683817, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.8201398253440857, "kl": 0.0026952847256325185, "learning_rate": 9.971936389148737e-06, "loss": 0.1901, "num_tokens": 704409.0, "reward": 0.625, "reward_std": 0.4862987995147705, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.9886536598205566, "sampling/importance_sampling_ratio/mean": 0.8393572568893433, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7230873107910156, "sampling/sampling_logp_difference/mean": 0.014116997830569744, "step": 4, "step_time": 31.27738287183456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 839.6428833007812, "completions/mean_terminated_length": 544.2666625976562, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.6205103173851967, "epoch": 0.004677268475210477, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3034396767616272, "kl": 0.005207642796449363, "learning_rate": 9.962581852198318e-06, "loss": 0.0565, "num_tokens": 879657.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571844935417175, "sampling/importance_sampling_ratio/max": 2.615630626678467, "sampling/importance_sampling_ratio/mean": 0.6636691689491272, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48145532608032227, "sampling/sampling_logp_difference/mean": 0.012481153942644596, "step": 5, "step_time": 32.0280326216016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 803.794677734375, "completions/mean_terminated_length": 533.3152465820312, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.5022454708814621, "epoch": 0.005612722170252572, "frac_reward_zero_std": 0.5, "grad_norm": 0.3532927930355072, "kl": 0.0062537656631320715, "learning_rate": 9.953227315247896e-06, "loss": -0.0414, "num_tokens": 1051098.0, "reward": 0.6696428656578064, "reward_std": 0.47245559096336365, "rewards/math_reward/mean": 0.6696428656578064, "rewards/math_reward/std": 0.47245559096336365, "sampling/importance_sampling_ratio/max": 2.9267385005950928, "sampling/importance_sampling_ratio/mean": 0.8045487999916077, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5577247142791748, "sampling/sampling_logp_difference/mean": 0.010914498940110207, "step": 6, "step_time": 31.184301891596988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 615.6964721679688, "completions/mean_terminated_length": 376.97918701171875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.538397453725338, "epoch": 0.006548175865294668, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.345644474029541, "kl": 0.00916260713711381, "learning_rate": 9.943872778297475e-06, "loss": 0.0383, "num_tokens": 1203048.0, "reward": 0.6785714626312256, "reward_std": 0.469123899936676, "rewards/math_reward/mean": 0.6785714030265808, "rewards/math_reward/std": 0.46912387013435364, "sampling/importance_sampling_ratio/max": 2.793092727661133, "sampling/importance_sampling_ratio/mean": 0.8598646521568298, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4594697952270508, "sampling/sampling_logp_difference/mean": 0.012022511102259159, "step": 7, "step_time": 28.48341524717398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 707.294677734375, "completions/mean_terminated_length": 515.7653198242188, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.5775975584983826, "epoch": 0.007483629560336763, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.5956489443778992, "kl": 0.009204224566929042, "learning_rate": 9.934518241347055e-06, "loss": 0.0827, "num_tokens": 1369113.0, "reward": 0.6517857313156128, "reward_std": 0.47854551672935486, "rewards/math_reward/mean": 0.6517857313156128, "rewards/math_reward/std": 0.47854548692703247, "sampling/importance_sampling_ratio/max": 2.795820474624634, "sampling/importance_sampling_ratio/mean": 0.8030524849891663, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5470898151397705, "sampling/sampling_logp_difference/mean": 0.012449244037270546, "step": 8, "step_time": 30.263077955227345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 714.294677734375, "completions/mean_terminated_length": 369.62921142578125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.6765417531132698, "epoch": 0.00841908325537886, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.5381031632423401, "kl": 0.009070764179341495, "learning_rate": 9.925163704396632e-06, "loss": 0.1203, "num_tokens": 1531378.0, "reward": 0.7321428656578064, "reward_std": 0.44483307003974915, "rewards/math_reward/mean": 0.7321428656578064, "rewards/math_reward/std": 0.44483304023742676, "sampling/importance_sampling_ratio/max": 2.6136107444763184, "sampling/importance_sampling_ratio/mean": 0.8340920209884644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5968514680862427, "sampling/sampling_logp_difference/mean": 0.01244332455098629, "step": 9, "step_time": 31.74206787464209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 839.3660888671875, "completions/mean_terminated_length": 436.48809814453125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.6448270604014397, "epoch": 0.009354536950420954, "frac_reward_zero_std": 0.5, "grad_norm": 0.26989978551864624, "kl": 0.010572452330961823, "learning_rate": 9.915809167446212e-06, "loss": 0.0095, "num_tokens": 1705851.0, "reward": 0.6428571939468384, "reward_std": 0.48131096363067627, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.48131096363067627, "sampling/importance_sampling_ratio/max": 2.6351191997528076, "sampling/importance_sampling_ratio/mean": 0.7158110737800598, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0278384685516357, "sampling/sampling_logp_difference/mean": 0.012709309346973896, "step": 10, "step_time": 33.6895954310894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2946428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 923.9464721679688, "completions/mean_terminated_length": 454.4050598144531, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.9221271872520447, "epoch": 0.01028999064546305, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.4008435606956482, "kl": 0.013059573015198112, "learning_rate": 9.906454630495791e-06, "loss": 0.179, "num_tokens": 1890461.0, "reward": 0.7321428656578064, "reward_std": 0.44483307003974915, "rewards/math_reward/mean": 0.7321428656578064, "rewards/math_reward/std": 0.44483304023742676, "sampling/importance_sampling_ratio/max": 2.749336004257202, "sampling/importance_sampling_ratio/mean": 0.726164698600769, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8405246734619141, "sampling/sampling_logp_difference/mean": 0.014502194710075855, "step": 11, "step_time": 34.555255440063775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4017857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 1046.919677734375, "completions/mean_terminated_length": 374.5522155761719, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 1.1787984073162079, "epoch": 0.011225444340505144, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.38906097412109375, "kl": 0.012733657844364643, "learning_rate": 9.89710009354537e-06, "loss": 0.1141, "num_tokens": 2090212.0, "reward": 0.6339285969734192, "reward_std": 0.483894407749176, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.7673048973083496, "sampling/importance_sampling_ratio/mean": 0.7277251482009888, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7925352454185486, "sampling/sampling_logp_difference/mean": 0.01673109270632267, "step": 12, "step_time": 35.76457388023846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.392857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 1090.7232666015625, "completions/mean_terminated_length": 471.308837890625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.8907198160886765, "epoch": 0.01216089803554724, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.9401674866676331, "kl": 0.015706030884757638, "learning_rate": 9.88774555659495e-06, "loss": 0.0721, "num_tokens": 2295381.0, "reward": 0.5089285969734192, "reward_std": 0.5021671056747437, "rewards/math_reward/mean": 0.5089285969734192, "rewards/math_reward/std": 0.5021671056747437, "sampling/importance_sampling_ratio/max": 2.9469563961029053, "sampling/importance_sampling_ratio/mean": 0.7219957709312439, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4720640182495117, "sampling/sampling_logp_difference/mean": 0.01425548829138279, "step": 13, "step_time": 37.55572304991074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 7.374636788881617e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.374636788881617e-05, "completions/clipped_ratio": 0.4821428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 1237.0179443359375, "completions/mean_terminated_length": 481.96551513671875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 1.0219630599021912, "epoch": 0.013096351730589336, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.2448042333126068, "kl": 0.015580840641632676, "learning_rate": 9.87839101964453e-06, "loss": -0.0103, "num_tokens": 2516455.0, "reward": 0.4642857313156128, "reward_std": 0.500964343547821, "rewards/math_reward/mean": 0.4642857015132904, "rewards/math_reward/std": 0.5009642839431763, "sampling/importance_sampling_ratio/max": 2.9535210132598877, "sampling/importance_sampling_ratio/mean": 0.5689948797225952, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5285539627075195, "sampling/sampling_logp_difference/mean": 0.015638306736946106, "step": 14, "step_time": 39.59749531862326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3482142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 986.794677734375, "completions/mean_terminated_length": 419.84930419921875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.9540613144636154, "epoch": 0.01403180542563143, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.36405766010284424, "kl": 0.015417205868288875, "learning_rate": 9.869036482694107e-06, "loss": 0.0043, "num_tokens": 2708760.0, "reward": 0.5714285969734192, "reward_std": 0.4970957934856415, "rewards/math_reward/mean": 0.5714285969734192, "rewards/math_reward/std": 0.4970957934856415, "sampling/importance_sampling_ratio/max": 2.9213130474090576, "sampling/importance_sampling_ratio/mean": 0.8028447031974792, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4943571090698242, "sampling/sampling_logp_difference/mean": 0.015184266492724419, "step": 15, "step_time": 35.24043712695129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 961.4375610351562, "completions/mean_terminated_length": 333.98590087890625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.78315868973732, "epoch": 0.014967259120673527, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.329635351896286, "kl": 0.018253440503031015, "learning_rate": 9.859681945743686e-06, "loss": 0.1379, "num_tokens": 2897121.0, "reward": 0.5892857313156128, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.5892857313156128, "rewards/math_reward/std": 0.4941745698451996, "sampling/importance_sampling_ratio/max": 2.792137622833252, "sampling/importance_sampling_ratio/mean": 0.7380985617637634, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5852257013320923, "sampling/sampling_logp_difference/mean": 0.014503950253129005, "step": 16, "step_time": 34.69490266079083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 909.3750610351562, "completions/mean_terminated_length": 453.9250183105469, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.8307785540819168, "epoch": 0.015902712815715623, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.4667915105819702, "kl": 0.01740267500281334, "learning_rate": 9.850327408793266e-06, "loss": 0.0599, "num_tokens": 3081179.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.5781314373016357, "sampling/importance_sampling_ratio/mean": 0.8151792883872986, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5732468366622925, "sampling/sampling_logp_difference/mean": 0.014950604178011417, "step": 17, "step_time": 34.24177399207838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1015.83935546875, "completions/mean_terminated_length": 419.80279541015625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.8995129019021988, "epoch": 0.01683816651075772, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.26191580295562744, "kl": 0.017754083033651114, "learning_rate": 9.840972871842845e-06, "loss": -0.0259, "num_tokens": 3275529.0, "reward": 0.6160714626312256, "reward_std": 0.48852667212486267, "rewards/math_reward/mean": 0.6160714030265808, "rewards/math_reward/std": 0.4885266423225403, "sampling/importance_sampling_ratio/max": 2.690437078475952, "sampling/importance_sampling_ratio/mean": 0.7544397711753845, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7388219833374023, "sampling/sampling_logp_difference/mean": 0.01415286399424076, "step": 18, "step_time": 35.81371890520677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3035714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 997.0714721679688, "completions/mean_terminated_length": 538.974365234375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.7967480421066284, "epoch": 0.01777362020579981, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.44262245297431946, "kl": 0.018978295382112265, "learning_rate": 9.831618334892423e-06, "loss": 0.1253, "num_tokens": 3469241.0, "reward": 0.6339285969734192, "reward_std": 0.483894407749176, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.7457501888275146, "sampling/importance_sampling_ratio/mean": 0.6835412979125977, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6868505477905273, "sampling/sampling_logp_difference/mean": 0.01513979583978653, "step": 19, "step_time": 35.553459625924006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.267857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 836.857177734375, "completions/mean_terminated_length": 393.7560729980469, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.9107038676738739, "epoch": 0.018709073900841908, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.373272567987442, "kl": 0.021745067089796066, "learning_rate": 9.822263797942002e-06, "loss": 0.0752, "num_tokens": 3646481.0, "reward": 0.598214328289032, "reward_std": 0.49246248602867126, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.4924624562263489, "sampling/importance_sampling_ratio/max": 2.449460029602051, "sampling/importance_sampling_ratio/mean": 0.66705322265625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7748486995697021, "sampling/sampling_logp_difference/mean": 0.017273273319005966, "step": 20, "step_time": 33.3480322828982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3750000298023224, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1100.321533203125, "completions/mean_terminated_length": 531.7142944335938, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 1.0537191033363342, "epoch": 0.019644527595884004, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.24656575918197632, "kl": 0.02032550238072872, "learning_rate": 9.812909260991581e-06, "loss": 0.0394, "num_tokens": 3849677.0, "reward": 0.5892857313156128, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.5892857313156128, "rewards/math_reward/std": 0.4941745698451996, "sampling/importance_sampling_ratio/max": 2.9280760288238525, "sampling/importance_sampling_ratio/mean": 0.6724278330802917, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48715853691101074, "sampling/sampling_logp_difference/mean": 0.01673627272248268, "step": 21, "step_time": 36.86821016995236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 845.7678833007812, "completions/mean_terminated_length": 385.6543273925781, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.9035803377628326, "epoch": 0.0205799812909261, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.31861916184425354, "kl": 0.01997462660074234, "learning_rate": 9.80355472404116e-06, "loss": 0.033, "num_tokens": 4023475.0, "reward": 0.6160714626312256, "reward_std": 0.48852667212486267, "rewards/math_reward/mean": 0.6160714030265808, "rewards/math_reward/std": 0.4885266423225403, "sampling/importance_sampling_ratio/max": 2.5504188537597656, "sampling/importance_sampling_ratio/mean": 0.7138457298278809, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4990367889404297, "sampling/sampling_logp_difference/mean": 0.0157103780657053, "step": 22, "step_time": 34.08600211213343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 814.4107666015625, "completions/mean_terminated_length": 403.21429443359375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.7514698207378387, "epoch": 0.021515434985968196, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.45147380232810974, "kl": 0.018756817560642958, "learning_rate": 9.79420018709074e-06, "loss": 0.1735, "num_tokens": 4199905.0, "reward": 0.6875000596046448, "reward_std": 0.4655956029891968, "rewards/math_reward/mean": 0.6875, "rewards/math_reward/std": 0.4655956029891968, "sampling/importance_sampling_ratio/max": 2.7556183338165283, "sampling/importance_sampling_ratio/mean": 0.7563361525535583, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46568727493286133, "sampling/sampling_logp_difference/mean": 0.014221079647541046, "step": 23, "step_time": 34.27125357929617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 877.8928833007812, "completions/mean_terminated_length": 430.0740661621094, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.7505601644515991, "epoch": 0.02245088868101029, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3539901077747345, "kl": 0.017586843576282263, "learning_rate": 9.78484565014032e-06, "loss": 0.0966, "num_tokens": 4383333.0, "reward": 0.625, "reward_std": 0.4862987697124481, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.872218608856201, "sampling/importance_sampling_ratio/mean": 0.6913287043571472, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5624688863754272, "sampling/sampling_logp_difference/mean": 0.014125186018645763, "step": 24, "step_time": 34.95869205077179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 918.71435546875, "completions/mean_terminated_length": 405.402587890625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 1.013735219836235, "epoch": 0.023386342376052385, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.21617349982261658, "kl": 0.020565785467624664, "learning_rate": 9.775491113189897e-06, "loss": 0.0087, "num_tokens": 4573125.0, "reward": 0.598214328289032, "reward_std": 0.49246248602867126, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.49246248602867126, "sampling/importance_sampling_ratio/max": 2.743206739425659, "sampling/importance_sampling_ratio/mean": 0.7455167174339294, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49198389053344727, "sampling/sampling_logp_difference/mean": 0.01639604941010475, "step": 25, "step_time": 35.95340558886528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3035714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 959.919677734375, "completions/mean_terminated_length": 485.6282043457031, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.7583374977111816, "epoch": 0.02432179607109448, "frac_reward_zero_std": 0.5, "grad_norm": 0.19719302654266357, "kl": 0.01726222736760974, "learning_rate": 9.766136576239477e-06, "loss": -0.0029, "num_tokens": 4754100.0, "reward": 0.5625, "reward_std": 0.4983079433441162, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.7599663734436035, "sampling/importance_sampling_ratio/mean": 0.7688745856285095, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4517171382904053, "sampling/sampling_logp_difference/mean": 0.013558655977249146, "step": 26, "step_time": 34.57535572396591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 817.9642944335938, "completions/mean_terminated_length": 347.2098693847656, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.9060320556163788, "epoch": 0.025257249766136577, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.25301623344421387, "kl": 0.021052172873169184, "learning_rate": 9.756782039289056e-06, "loss": -0.0451, "num_tokens": 4927776.0, "reward": 0.5714285969734192, "reward_std": 0.49709582328796387, "rewards/math_reward/mean": 0.5714285969734192, "rewards/math_reward/std": 0.4970957934856415, "sampling/importance_sampling_ratio/max": 2.693929672241211, "sampling/importance_sampling_ratio/mean": 0.6569679379463196, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6866680979728699, "sampling/sampling_logp_difference/mean": 0.0174298956990242, "step": 27, "step_time": 33.785803941078484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 878.8035888671875, "completions/mean_terminated_length": 431.3333435058594, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 1.0151092112064362, "epoch": 0.026192703461178673, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.4979371428489685, "kl": 0.018488436937332153, "learning_rate": 9.747427502338635e-06, "loss": 0.1178, "num_tokens": 5110306.0, "reward": 0.6428571939468384, "reward_std": 0.48131096363067627, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.4813109338283539, "sampling/importance_sampling_ratio/max": 2.857450485229492, "sampling/importance_sampling_ratio/mean": 0.7366995215415955, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48923778533935547, "sampling/sampling_logp_difference/mean": 0.016557222232222557, "step": 28, "step_time": 34.287768678274006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.267857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 900.3125610351562, "completions/mean_terminated_length": 480.42681884765625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.7308783680200577, "epoch": 0.027128157156220765, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.21552662551403046, "kl": 0.018320413306355476, "learning_rate": 9.738072965388215e-06, "loss": 0.0066, "num_tokens": 5291725.0, "reward": 0.535714328289032, "reward_std": 0.500964343547821, "rewards/math_reward/mean": 0.5357142686843872, "rewards/math_reward/std": 0.5009642839431763, "sampling/importance_sampling_ratio/max": 2.967113971710205, "sampling/importance_sampling_ratio/mean": 0.7355402708053589, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5293288230895996, "sampling/sampling_logp_difference/mean": 0.013998405076563358, "step": 29, "step_time": 34.62437942624092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00014111041309661232, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014111041309661232, "completions/clipped_ratio": 0.2589285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 863.4732666015625, "completions/mean_terminated_length": 449.6023864746094, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.8205512166023254, "epoch": 0.02806361085126286, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.27263525128364563, "kl": 0.018355075269937515, "learning_rate": 9.728718428437794e-06, "loss": 0.0749, "num_tokens": 5474698.0, "reward": 0.598214328289032, "reward_std": 0.49246248602867126, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.49246248602867126, "sampling/importance_sampling_ratio/max": 2.8499417304992676, "sampling/importance_sampling_ratio/mean": 0.7751789689064026, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0748112201690674, "sampling/sampling_logp_difference/mean": 0.013828633353114128, "step": 30, "step_time": 34.22921681427397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.267857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 856.9732666015625, "completions/mean_terminated_length": 421.231689453125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.7285836637020111, "epoch": 0.028999064546304958, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.27999743819236755, "kl": 0.017499278066679835, "learning_rate": 9.719363891487372e-06, "loss": -0.0356, "num_tokens": 5652855.0, "reward": 0.6339285969734192, "reward_std": 0.483894407749176, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.6144497394561768, "sampling/importance_sampling_ratio/mean": 0.7833837866783142, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7187361717224121, "sampling/sampling_logp_difference/mean": 0.014114965684711933, "step": 31, "step_time": 33.257148538250476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 710.3482666015625, "completions/mean_terminated_length": 383.3666687011719, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.743230514228344, "epoch": 0.029934518241347054, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.2426978349685669, "kl": 0.019547419156879187, "learning_rate": 9.710009354536951e-06, "loss": 0.015, "num_tokens": 5816446.0, "reward": 0.8750000596046448, "reward_std": 0.33220529556274414, "rewards/math_reward/mean": 0.875, "rewards/math_reward/std": 0.33220529556274414, "sampling/importance_sampling_ratio/max": 2.5459535121917725, "sampling/importance_sampling_ratio/mean": 0.7554444074630737, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7576909065246582, "sampling/sampling_logp_difference/mean": 0.013645714148879051, "step": 32, "step_time": 30.289431625045836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 747.4464721679688, "completions/mean_terminated_length": 392.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.6182110533118248, "epoch": 0.03086997193638915, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.4938831627368927, "kl": 0.019201325718313456, "learning_rate": 9.70065481758653e-06, "loss": 0.1074, "num_tokens": 5984440.0, "reward": 0.7053571939468384, "reward_std": 0.45793095231056213, "rewards/math_reward/mean": 0.7053571343421936, "rewards/math_reward/std": 0.45793095231056213, "sampling/importance_sampling_ratio/max": 2.768648147583008, "sampling/importance_sampling_ratio/mean": 0.7364203333854675, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5175700187683105, "sampling/sampling_logp_difference/mean": 0.013598913326859474, "step": 33, "step_time": 31.07748026982881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.267857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 917.3214721679688, "completions/mean_terminated_length": 503.65850830078125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.6338589936494827, "epoch": 0.031805425631431246, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.19577626883983612, "kl": 0.017656521871685982, "learning_rate": 9.691300280636108e-06, "loss": 0.0519, "num_tokens": 6169012.0, "reward": 0.5892857313156128, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.5892857313156128, "rewards/math_reward/std": 0.4941745400428772, "sampling/importance_sampling_ratio/max": 2.8688361644744873, "sampling/importance_sampling_ratio/mean": 0.6736416816711426, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4905414581298828, "sampling/sampling_logp_difference/mean": 0.013752533122897148, "step": 34, "step_time": 34.61162173678167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 700.3125610351562, "completions/mean_terminated_length": 407.33697509765625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.6134531199932098, "epoch": 0.03274087932647334, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.7360969185829163, "kl": 0.016913133906200528, "learning_rate": 9.681945743685688e-06, "loss": 0.1808, "num_tokens": 6329175.0, "reward": 0.7410714626312256, "reward_std": 0.4400150775909424, "rewards/math_reward/mean": 0.7410714030265808, "rewards/math_reward/std": 0.44001504778862, "sampling/importance_sampling_ratio/max": 2.8359134197235107, "sampling/importance_sampling_ratio/mean": 0.8083340525627136, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7433958053588867, "sampling/sampling_logp_difference/mean": 0.013467836193740368, "step": 35, "step_time": 29.667832166189328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 845.294677734375, "completions/mean_terminated_length": 425.072265625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.5870131179690361, "epoch": 0.03367633302151544, "frac_reward_zero_std": 0.5, "grad_norm": 0.46825891733169556, "kl": 0.01732715731486678, "learning_rate": 9.672591206735269e-06, "loss": 0.1096, "num_tokens": 6503720.0, "reward": 0.6785714626312256, "reward_std": 0.46912387013435364, "rewards/math_reward/mean": 0.6785714030265808, "rewards/math_reward/std": 0.46912387013435364, "sampling/importance_sampling_ratio/max": 2.6529598236083984, "sampling/importance_sampling_ratio/mean": 0.7407724261283875, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47348499298095703, "sampling/sampling_logp_difference/mean": 0.012951518408954144, "step": 36, "step_time": 33.49017690937035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 680.1517944335938, "completions/mean_terminated_length": 418.223388671875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.5283501520752907, "epoch": 0.03461178671655753, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3757780194282532, "kl": 0.01834326656535268, "learning_rate": 9.663236669784846e-06, "loss": 0.0729, "num_tokens": 6665537.0, "reward": 0.6160714626312256, "reward_std": 0.48852667212486267, "rewards/math_reward/mean": 0.6160714030265808, "rewards/math_reward/std": 0.4885266423225403, "sampling/importance_sampling_ratio/max": 2.7194414138793945, "sampling/importance_sampling_ratio/mean": 0.8215468525886536, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49709129333496094, "sampling/sampling_logp_difference/mean": 0.011810969561338425, "step": 37, "step_time": 31.666325060185045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 841.2767944335938, "completions/mean_terminated_length": 494.5172424316406, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.7155907154083252, "epoch": 0.03554724041159962, "frac_reward_zero_std": 0.5, "grad_norm": 0.20322710275650024, "kl": 0.017374981194734573, "learning_rate": 9.653882132834426e-06, "loss": -0.0434, "num_tokens": 6841336.0, "reward": 0.6160714626312256, "reward_std": 0.48852667212486267, "rewards/math_reward/mean": 0.6160714030265808, "rewards/math_reward/std": 0.4885266423225403, "sampling/importance_sampling_ratio/max": 2.145857095718384, "sampling/importance_sampling_ratio/mean": 0.7075915932655334, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4943680763244629, "sampling/sampling_logp_difference/mean": 0.012724591419100761, "step": 38, "step_time": 32.58959316089749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 862.4107666015625, "completions/mean_terminated_length": 503.97674560546875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5596742257475853, "epoch": 0.03648269410664172, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.302670955657959, "kl": 0.01660713250748813, "learning_rate": 9.644527595884005e-06, "loss": 0.0357, "num_tokens": 7018478.0, "reward": 0.5, "reward_std": 0.5022472143173218, "rewards/math_reward/mean": 0.5, "rewards/math_reward/std": 0.5022472143173218, "sampling/importance_sampling_ratio/max": 2.9198997020721436, "sampling/importance_sampling_ratio/mean": 0.7460393309593201, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49065494537353516, "sampling/sampling_logp_difference/mean": 0.012680876068770885, "step": 39, "step_time": 32.935591471148655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 823.3482666015625, "completions/mean_terminated_length": 489.352294921875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.8091948628425598, "epoch": 0.037418147801683815, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.21990777552127838, "kl": 0.018513760529458523, "learning_rate": 9.635173058933583e-06, "loss": -0.0507, "num_tokens": 7198309.0, "reward": 0.5535714626312256, "reward_std": 0.49935606122016907, "rewards/math_reward/mean": 0.5535714030265808, "rewards/math_reward/std": 0.49935609102249146, "sampling/importance_sampling_ratio/max": 2.8607513904571533, "sampling/importance_sampling_ratio/mean": 0.7620623707771301, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49661827087402344, "sampling/sampling_logp_difference/mean": 0.013555042445659637, "step": 40, "step_time": 33.810600000666454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 746.0803833007812, "completions/mean_terminated_length": 513.1052856445312, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.5743209645152092, "epoch": 0.03835360149672591, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.36449968814849854, "kl": 0.017925681080669165, "learning_rate": 9.625818521983162e-06, "loss": 0.023, "num_tokens": 7361670.0, "reward": 0.660714328289032, "reward_std": 0.4755948781967163, "rewards/math_reward/mean": 0.6607142686843872, "rewards/math_reward/std": 0.4755948781967163, "sampling/importance_sampling_ratio/max": 2.8446075916290283, "sampling/importance_sampling_ratio/mean": 0.7403399348258972, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49387502670288086, "sampling/sampling_logp_difference/mean": 0.011374018155038357, "step": 41, "step_time": 30.55090121086687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 794.4107666015625, "completions/mean_terminated_length": 452.5227355957031, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.5380754619836807, "epoch": 0.03928905519176801, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.34910598397254944, "kl": 0.017142182681709528, "learning_rate": 9.616463985032741e-06, "loss": 0.0604, "num_tokens": 7529116.0, "reward": 0.625, "reward_std": 0.4862987697124481, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.997293472290039, "sampling/importance_sampling_ratio/mean": 0.9083004593849182, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48091357946395874, "sampling/sampling_logp_difference/mean": 0.010413939133286476, "step": 42, "step_time": 30.967001381795853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 788.0535888671875, "completions/mean_terminated_length": 426.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.648337297141552, "epoch": 0.040224508886810104, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3533414900302887, "kl": 0.016745051834732294, "learning_rate": 9.607109448082321e-06, "loss": 0.0617, "num_tokens": 7699930.0, "reward": 0.6517857313156128, "reward_std": 0.47854551672935486, "rewards/math_reward/mean": 0.6517857313156128, "rewards/math_reward/std": 0.47854548692703247, "sampling/importance_sampling_ratio/max": 2.0536386966705322, "sampling/importance_sampling_ratio/mean": 0.7103310823440552, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49375009536743164, "sampling/sampling_logp_difference/mean": 0.011771543882787228, "step": 43, "step_time": 31.775261400965974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.330357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 1006.419677734375, "completions/mean_terminated_length": 492.5733337402344, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.6750987879931927, "epoch": 0.0411599625818522, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.30343326926231384, "kl": 0.013880337588489056, "learning_rate": 9.5977549111319e-06, "loss": 0.1025, "num_tokens": 7893593.0, "reward": 0.5892857313156128, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.5892857313156128, "rewards/math_reward/std": 0.4941745698451996, "sampling/importance_sampling_ratio/max": 2.933511972427368, "sampling/importance_sampling_ratio/mean": 0.8482525944709778, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7177674770355225, "sampling/sampling_logp_difference/mean": 0.011876443400979042, "step": 44, "step_time": 35.32452671416104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1590.0, "completions/mean_length": 755.9642944335938, "completions/mean_terminated_length": 422.0674133300781, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.523250125348568, "epoch": 0.042095416276894296, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.37776198983192444, "kl": 0.01600645831786096, "learning_rate": 9.58840037418148e-06, "loss": 0.0627, "num_tokens": 8061157.0, "reward": 0.723214328289032, "reward_std": 0.4494204819202423, "rewards/math_reward/mean": 0.7232142686843872, "rewards/math_reward/std": 0.4494204819202423, "sampling/importance_sampling_ratio/max": 2.9380221366882324, "sampling/importance_sampling_ratio/mean": 0.7887592315673828, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49698305130004883, "sampling/sampling_logp_difference/mean": 0.011541066691279411, "step": 45, "step_time": 30.86015928699635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 838.6875610351562, "completions/mean_terminated_length": 473.0813903808594, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.5469966307282448, "epoch": 0.04303086997193639, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.26889777183532715, "kl": 0.018361144931986928, "learning_rate": 9.579045837231057e-06, "loss": 0.0523, "num_tokens": 8239954.0, "reward": 0.5535714626312256, "reward_std": 0.49935612082481384, "rewards/math_reward/mean": 0.5535714030265808, "rewards/math_reward/std": 0.49935609102249146, "sampling/importance_sampling_ratio/max": 2.9643144607543945, "sampling/importance_sampling_ratio/mean": 0.7331107258796692, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49621057510375977, "sampling/sampling_logp_difference/mean": 0.01167861744761467, "step": 46, "step_time": 32.72249426343478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 670.6428833007812, "completions/mean_terminated_length": 406.89361572265625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.5179079473018646, "epoch": 0.04396632366697849, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.7799696326255798, "kl": 0.017839635722339153, "learning_rate": 9.569691300280637e-06, "loss": 0.2546, "num_tokens": 8397194.0, "reward": 0.7321428656578064, "reward_std": 0.44483304023742676, "rewards/math_reward/mean": 0.7321428656578064, "rewards/math_reward/std": 0.44483304023742676, "sampling/importance_sampling_ratio/max": 2.915696382522583, "sampling/importance_sampling_ratio/mean": 0.8681398034095764, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46018147468566895, "sampling/sampling_logp_difference/mean": 0.010896636173129082, "step": 47, "step_time": 29.64796581096016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1623.0, "completions/mean_length": 899.0982666015625, "completions/mean_terminated_length": 568.9540405273438, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.5863955542445183, "epoch": 0.04490177736202058, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.16265104711055756, "kl": 0.017523460322991014, "learning_rate": 9.560336763330216e-06, "loss": 0.0036, "num_tokens": 8576285.0, "reward": 0.392857164144516, "reward_std": 0.4905805289745331, "rewards/math_reward/mean": 0.3928571343421936, "rewards/math_reward/std": 0.4905805289745331, "sampling/importance_sampling_ratio/max": 2.4639525413513184, "sampling/importance_sampling_ratio/mean": 0.6832291483879089, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5204067230224609, "sampling/sampling_logp_difference/mean": 0.012079255655407906, "step": 48, "step_time": 32.83921501529403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 714.1250610351562, "completions/mean_terminated_length": 441.6129150390625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4434436932206154, "epoch": 0.04583723105706267, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.248118594288826, "kl": 0.01916261320002377, "learning_rate": 9.550982226379795e-06, "loss": 0.0209, "num_tokens": 8740171.0, "reward": 0.6696428656578064, "reward_std": 0.47245559096336365, "rewards/math_reward/mean": 0.6696428656578064, "rewards/math_reward/std": 0.47245559096336365, "sampling/importance_sampling_ratio/max": 2.176529884338379, "sampling/importance_sampling_ratio/mean": 0.7194594144821167, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4895644187927246, "sampling/sampling_logp_difference/mean": 0.010229106992483139, "step": 49, "step_time": 30.442159808939323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1493.0, "completions/mean_length": 726.9107666015625, "completions/mean_terminated_length": 457.0107421875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.9446404427289963, "epoch": 0.04677268475210477, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.29597824811935425, "kl": 0.02105015004053712, "learning_rate": 9.541627689429373e-06, "loss": 0.007, "num_tokens": 8900017.0, "reward": 0.5625, "reward_std": 0.4983079731464386, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.878451347351074, "sampling/importance_sampling_ratio/mean": 0.771983802318573, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5115871429443359, "sampling/sampling_logp_difference/mean": 0.01634705811738968, "step": 50, "step_time": 30.528939883224666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 458.4375305175781, "completions/mean_terminated_length": 352.4666748046875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.49947137385606766, "epoch": 0.047708138447146865, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.47253841161727905, "kl": 0.020557332783937454, "learning_rate": 9.532273152478954e-06, "loss": -0.0084, "num_tokens": 9036370.0, "reward": 0.7142857313156128, "reward_std": 0.453784316778183, "rewards/math_reward/mean": 0.7142857313156128, "rewards/math_reward/std": 0.4537842869758606, "sampling/importance_sampling_ratio/max": 2.584535837173462, "sampling/importance_sampling_ratio/mean": 0.9255010485649109, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.461733341217041, "sampling/sampling_logp_difference/mean": 0.01150066964328289, "step": 51, "step_time": 26.491471647983417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 533.25, "completions/mean_terminated_length": 368.2772216796875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.4760161563754082, "epoch": 0.04864359214218896, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.33404678106307983, "kl": 0.02161170681938529, "learning_rate": 9.522918615528532e-06, "loss": -0.0025, "num_tokens": 9183654.0, "reward": 0.7500000596046448, "reward_std": 0.43495887517929077, "rewards/math_reward/mean": 0.75, "rewards/math_reward/std": 0.4349588453769684, "sampling/importance_sampling_ratio/max": 2.977421283721924, "sampling/importance_sampling_ratio/mean": 0.8042804002761841, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49817895889282227, "sampling/sampling_logp_difference/mean": 0.011124475859105587, "step": 52, "step_time": 27.825897573959082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 734.4910888671875, "completions/mean_terminated_length": 482.96807861328125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.7460342273116112, "epoch": 0.04957904583723106, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.28653913736343384, "kl": 0.022133768070489168, "learning_rate": 9.513564078578111e-06, "loss": 0.0284, "num_tokens": 9347581.0, "reward": 0.625, "reward_std": 0.4862987995147705, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.679694175720215, "sampling/importance_sampling_ratio/mean": 0.7457960844039917, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4740779399871826, "sampling/sampling_logp_difference/mean": 0.014734752476215363, "step": 53, "step_time": 30.455030621262267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.133928582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 623.044677734375, "completions/mean_terminated_length": 402.6907043457031, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.49333614110946655, "epoch": 0.050514499532273154, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3566436171531677, "kl": 0.01992735406383872, "learning_rate": 9.50420954162769e-06, "loss": 0.0696, "num_tokens": 9499594.0, "reward": 0.7589285969734192, "reward_std": 0.4296559691429138, "rewards/math_reward/mean": 0.7589285969734192, "rewards/math_reward/std": 0.4296559691429138, "sampling/importance_sampling_ratio/max": 2.9384725093841553, "sampling/importance_sampling_ratio/mean": 0.8593305349349976, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.43727564811706543, "sampling/sampling_logp_difference/mean": 0.010413090698421001, "step": 54, "step_time": 28.619269684888422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.133928582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 706.1250610351562, "completions/mean_terminated_length": 498.6185302734375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.5732534229755402, "epoch": 0.05144995322731525, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.4857997000217438, "kl": 0.018539296463131905, "learning_rate": 9.49485500467727e-06, "loss": 0.1551, "num_tokens": 9662024.0, "reward": 0.5625, "reward_std": 0.4983079731464386, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.9853203296661377, "sampling/importance_sampling_ratio/mean": 0.7511168122291565, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5829417705535889, "sampling/sampling_logp_difference/mean": 0.012209615670144558, "step": 55, "step_time": 30.479912928538397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 573.5267944335938, "completions/mean_terminated_length": 362.88775634765625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.7207027524709702, "epoch": 0.052385406922357346, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3442149758338928, "kl": 0.020023624412715435, "learning_rate": 9.485500467726848e-06, "loss": 0.0348, "num_tokens": 9811499.0, "reward": 0.7767857313156128, "reward_std": 0.41827234625816345, "rewards/math_reward/mean": 0.7767857313156128, "rewards/math_reward/std": 0.41827234625816345, "sampling/importance_sampling_ratio/max": 2.7802774906158447, "sampling/importance_sampling_ratio/mean": 0.7461709976196289, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49312496185302734, "sampling/sampling_logp_difference/mean": 0.014069957658648491, "step": 56, "step_time": 28.127612228039652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 684.5000610351562, "completions/mean_terminated_length": 423.40423583984375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5792168974876404, "epoch": 0.05332086061739944, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3714529871940613, "kl": 0.019985825289040804, "learning_rate": 9.476145930776427e-06, "loss": 0.0668, "num_tokens": 9968571.0, "reward": 0.6964285969734192, "reward_std": 0.4618662893772125, "rewards/math_reward/mean": 0.6964285969734192, "rewards/math_reward/std": 0.4618662893772125, "sampling/importance_sampling_ratio/max": 2.777597427368164, "sampling/importance_sampling_ratio/mean": 0.7966075539588928, "sampling/importance_sampling_ratio/min": 0.014431014657020569, "sampling/sampling_logp_difference/max": 0.4822111129760742, "sampling/sampling_logp_difference/mean": 0.012437769211828709, "step": 57, "step_time": 29.339288994902745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 824.5357666015625, "completions/mean_terminated_length": 525.4666748046875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.7217714339494705, "epoch": 0.05425631431244153, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.23752249777317047, "kl": 0.020568890497088432, "learning_rate": 9.466791393826006e-06, "loss": -0.0448, "num_tokens": 10144831.0, "reward": 0.5, "reward_std": 0.5022472143173218, "rewards/math_reward/mean": 0.5, "rewards/math_reward/std": 0.5022472143173218, "sampling/importance_sampling_ratio/max": 2.54487943649292, "sampling/importance_sampling_ratio/mean": 0.7428828477859497, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8435235023498535, "sampling/sampling_logp_difference/mean": 0.01342224795371294, "step": 58, "step_time": 31.957725453190506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 613.9553833007812, "completions/mean_terminated_length": 488.6504821777344, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.5001910850405693, "epoch": 0.05519176800748363, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.32284459471702576, "kl": 0.018960077315568924, "learning_rate": 9.457436856875586e-06, "loss": -0.0525, "num_tokens": 10299906.0, "reward": 0.6517857313156128, "reward_std": 0.47854551672935486, "rewards/math_reward/mean": 0.6517857313156128, "rewards/math_reward/std": 0.47854551672935486, "sampling/importance_sampling_ratio/max": 2.4928781986236572, "sampling/importance_sampling_ratio/mean": 0.8286952376365662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48565101623535156, "sampling/sampling_logp_difference/mean": 0.012194250710308552, "step": 59, "step_time": 28.37260682391934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 727.2142944335938, "completions/mean_terminated_length": 422.4176025390625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.695380762219429, "epoch": 0.05612722170252572, "frac_reward_zero_std": 0.5, "grad_norm": 0.2540228068828583, "kl": 0.020683465991169214, "learning_rate": 9.448082319925165e-06, "loss": 0.0642, "num_tokens": 10466978.0, "reward": 0.6517857313156128, "reward_std": 0.47854551672935486, "rewards/math_reward/mean": 0.6517857313156128, "rewards/math_reward/std": 0.47854548692703247, "sampling/importance_sampling_ratio/max": 2.2374610900878906, "sampling/importance_sampling_ratio/mean": 0.7185073494911194, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7263672351837158, "sampling/sampling_logp_difference/mean": 0.014006296172738075, "step": 60, "step_time": 31.83402775460854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 671.2410888671875, "completions/mean_terminated_length": 441.78125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.5000432431697845, "epoch": 0.05706267539756782, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.5439503788948059, "kl": 0.020440965425223112, "learning_rate": 9.438727782974744e-06, "loss": 0.0267, "num_tokens": 10622749.0, "reward": 0.6428571939468384, "reward_std": 0.48131096363067627, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.4813109338283539, "sampling/importance_sampling_ratio/max": 2.8767971992492676, "sampling/importance_sampling_ratio/mean": 0.8917859196662903, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49910926818847656, "sampling/sampling_logp_difference/mean": 0.01123070903122425, "step": 61, "step_time": 29.146830308018252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1692.0, "completions/mean_length": 789.5178833007812, "completions/mean_terminated_length": 532.4086303710938, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.5183047950267792, "epoch": 0.057998129092609915, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.21433782577514648, "kl": 0.01895526284351945, "learning_rate": 9.429373246024322e-06, "loss": 0.081, "num_tokens": 10787535.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.108966827392578, "sampling/importance_sampling_ratio/mean": 0.719986081123352, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49283552169799805, "sampling/sampling_logp_difference/mean": 0.011778203770518303, "step": 62, "step_time": 31.218348156893626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 843.6339721679688, "completions/mean_terminated_length": 497.5517272949219, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.7024109810590744, "epoch": 0.05893358278765201, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.41907572746276855, "kl": 0.02052247617393732, "learning_rate": 9.420018709073902e-06, "loss": 0.0785, "num_tokens": 10963238.0, "reward": 0.5714285969734192, "reward_std": 0.4970957934856415, "rewards/math_reward/mean": 0.5714285969734192, "rewards/math_reward/std": 0.4970957934856415, "sampling/importance_sampling_ratio/max": 2.8878562450408936, "sampling/importance_sampling_ratio/mean": 0.870850145816803, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4658489227294922, "sampling/sampling_logp_difference/mean": 0.013363021425902843, "step": 63, "step_time": 32.5338203846477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 745.2053833007812, "completions/mean_terminated_length": 389.8977355957031, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.7004088312387466, "epoch": 0.05986903648269411, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.4603191614151001, "kl": 0.0222168006002903, "learning_rate": 9.410664172123481e-06, "loss": -0.033, "num_tokens": 11129085.0, "reward": 0.5625, "reward_std": 0.4983079731464386, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.7347192764282227, "sampling/importance_sampling_ratio/mean": 0.8465678095817566, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4079180955886841, "sampling/sampling_logp_difference/mean": 0.014567007310688496, "step": 64, "step_time": 30.82592878001742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 545.419677734375, "completions/mean_terminated_length": 398.10784912109375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.5999510288238525, "epoch": 0.0608044901777362, "frac_reward_zero_std": 0.5, "grad_norm": 0.46167072653770447, "kl": 0.02151336520910263, "learning_rate": 9.401309635173059e-06, "loss": 0.0174, "num_tokens": 11272444.0, "reward": 0.723214328289032, "reward_std": 0.4494204819202423, "rewards/math_reward/mean": 0.7232142686843872, "rewards/math_reward/std": 0.4494204819202423, "sampling/importance_sampling_ratio/max": 2.9650213718414307, "sampling/importance_sampling_ratio/mean": 0.9216145277023315, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6143853664398193, "sampling/sampling_logp_difference/mean": 0.013841314241290092, "step": 65, "step_time": 26.92554700979963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 650.857177734375, "completions/mean_terminated_length": 467.3939514160156, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.5425730645656586, "epoch": 0.0617399438727783, "frac_reward_zero_std": 0.5, "grad_norm": 0.3994305431842804, "kl": 0.018951176200062037, "learning_rate": 9.39195509822264e-06, "loss": 0.0058, "num_tokens": 11423396.0, "reward": 0.6785714626312256, "reward_std": 0.46912387013435364, "rewards/math_reward/mean": 0.6785714030265808, "rewards/math_reward/std": 0.46912387013435364, "sampling/importance_sampling_ratio/max": 2.799588918685913, "sampling/importance_sampling_ratio/mean": 0.8269763588905334, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48838257789611816, "sampling/sampling_logp_difference/mean": 0.012734406627714634, "step": 66, "step_time": 29.26173396036029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 595.3482666015625, "completions/mean_terminated_length": 404.595947265625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.5291729271411896, "epoch": 0.0626753975678204, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.46969571709632874, "kl": 0.02131913462653756, "learning_rate": 9.382600561272219e-06, "loss": 0.0554, "num_tokens": 11574147.0, "reward": 0.6071428656578064, "reward_std": 0.4905805289745331, "rewards/math_reward/mean": 0.6071428656578064, "rewards/math_reward/std": 0.4905804991722107, "sampling/importance_sampling_ratio/max": 2.5890960693359375, "sampling/importance_sampling_ratio/mean": 0.8951422572135925, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46256113052368164, "sampling/sampling_logp_difference/mean": 0.012441848404705524, "step": 67, "step_time": 27.985418412834406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 583.1785888671875, "completions/mean_terminated_length": 321.0526428222656, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.7052773088216782, "epoch": 0.06361085126286249, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.43473148345947266, "kl": 0.02405428420752287, "learning_rate": 9.373246024321797e-06, "loss": 0.0497, "num_tokens": 11721471.0, "reward": 0.8035714626312256, "reward_std": 0.3990819752216339, "rewards/math_reward/mean": 0.8035714030265808, "rewards/math_reward/std": 0.3990819454193115, "sampling/importance_sampling_ratio/max": 2.931300163269043, "sampling/importance_sampling_ratio/mean": 0.926205575466156, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.45746779441833496, "sampling/sampling_logp_difference/mean": 0.014713690616190434, "step": 68, "step_time": 28.19314338406548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 644.0892944335938, "completions/mean_terminated_length": 410.10418701171875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.8531732261180878, "epoch": 0.06454630495790459, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 1.0469344854354858, "kl": 0.02244015596807003, "learning_rate": 9.363891487371376e-06, "loss": 0.0849, "num_tokens": 11878273.0, "reward": 0.6875000596046448, "reward_std": 0.4655956029891968, "rewards/math_reward/mean": 0.6875, "rewards/math_reward/std": 0.4655956029891968, "sampling/importance_sampling_ratio/max": 2.969470739364624, "sampling/importance_sampling_ratio/mean": 0.8139554262161255, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5276983976364136, "sampling/sampling_logp_difference/mean": 0.015675945207476616, "step": 69, "step_time": 29.462264789966866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 567.0, "completions/mean_terminated_length": 389.2799987792969, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.6743750870227814, "epoch": 0.06548175865294668, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.6206972002983093, "kl": 0.02341578621417284, "learning_rate": 9.354536950420955e-06, "loss": 0.1954, "num_tokens": 12024393.0, "reward": 0.6785714626312256, "reward_std": 0.469123899936676, "rewards/math_reward/mean": 0.6785714030265808, "rewards/math_reward/std": 0.46912387013435364, "sampling/importance_sampling_ratio/max": 2.910506010055542, "sampling/importance_sampling_ratio/mean": 0.8511191010475159, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4658932685852051, "sampling/sampling_logp_difference/mean": 0.014093993231654167, "step": 70, "step_time": 28.017500965856016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 699.419677734375, "completions/mean_terminated_length": 423.9032287597656, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.7149751037359238, "epoch": 0.06641721234798878, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.2898215055465698, "kl": 0.02292506769299507, "learning_rate": 9.345182413470533e-06, "loss": 0.0348, "num_tokens": 12181224.0, "reward": 0.5625, "reward_std": 0.4983079731464386, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.7086901664733887, "sampling/importance_sampling_ratio/mean": 0.7828804850578308, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4978141784667969, "sampling/sampling_logp_difference/mean": 0.01338591892272234, "step": 71, "step_time": 30.164368807105348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 752.6428833007812, "completions/mean_terminated_length": 471.0434875488281, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.8071641623973846, "epoch": 0.06735266604303088, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.34986037015914917, "kl": 0.025217668619006872, "learning_rate": 9.335827876520112e-06, "loss": 0.0688, "num_tokens": 12350560.0, "reward": 0.5714285969734192, "reward_std": 0.4970957934856415, "rewards/math_reward/mean": 0.5714285969734192, "rewards/math_reward/std": 0.4970957934856415, "sampling/importance_sampling_ratio/max": 2.4916598796844482, "sampling/importance_sampling_ratio/mean": 0.7683644890785217, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.64920574426651, "sampling/sampling_logp_difference/mean": 0.015260194428265095, "step": 72, "step_time": 33.15771950921044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 796.1964721679688, "completions/mean_terminated_length": 540.4515991210938, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.9866662174463272, "epoch": 0.06828811973807297, "frac_reward_zero_std": 0.5, "grad_norm": 0.1912805289030075, "kl": 0.02386835915967822, "learning_rate": 9.326473339569692e-06, "loss": -0.0195, "num_tokens": 12522478.0, "reward": 0.4642857313156128, "reward_std": 0.500964343547821, "rewards/math_reward/mean": 0.4642857015132904, "rewards/math_reward/std": 0.5009642839431763, "sampling/importance_sampling_ratio/max": 2.4113287925720215, "sampling/importance_sampling_ratio/mean": 0.6561877131462097, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4637186527252197, "sampling/sampling_logp_difference/mean": 0.016834108158946037, "step": 73, "step_time": 31.872864544857293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 692.3660888671875, "completions/mean_terminated_length": 397.6630554199219, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 1.0243710428476334, "epoch": 0.06922357343311505, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.37335821986198425, "kl": 0.024980470538139343, "learning_rate": 9.317118802619271e-06, "loss": -0.0134, "num_tokens": 12682511.0, "reward": 0.598214328289032, "reward_std": 0.49246248602867126, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.49246248602867126, "sampling/importance_sampling_ratio/max": 2.5328028202056885, "sampling/importance_sampling_ratio/mean": 0.6831609606742859, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1976690292358398, "sampling/sampling_logp_difference/mean": 0.01645776815712452, "step": 74, "step_time": 30.296567565063015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 676.5535888671875, "completions/mean_terminated_length": 378.4130554199219, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.881963811814785, "epoch": 0.07015902712815715, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.7279005646705627, "kl": 0.02499164454638958, "learning_rate": 9.30776426566885e-06, "loss": 0.1758, "num_tokens": 12843365.0, "reward": 0.5714285969734192, "reward_std": 0.49709582328796387, "rewards/math_reward/mean": 0.5714285969734192, "rewards/math_reward/std": 0.4970957934856415, "sampling/importance_sampling_ratio/max": 2.733616352081299, "sampling/importance_sampling_ratio/mean": 0.9129493832588196, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4903218746185303, "sampling/sampling_logp_difference/mean": 0.014848687686026096, "step": 75, "step_time": 29.421985660213977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 599.6607666015625, "completions/mean_terminated_length": 340.4842224121094, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.8478939980268478, "epoch": 0.07109448082319925, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.34044405817985535, "kl": 0.02551961550489068, "learning_rate": 9.29840972871843e-06, "loss": 0.025, "num_tokens": 12992375.0, "reward": 0.8392857313156128, "reward_std": 0.368917852640152, "rewards/math_reward/mean": 0.8392857313156128, "rewards/math_reward/std": 0.368917852640152, "sampling/importance_sampling_ratio/max": 2.6362719535827637, "sampling/importance_sampling_ratio/mean": 0.8332818150520325, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4909358024597168, "sampling/sampling_logp_difference/mean": 0.01535120327025652, "step": 76, "step_time": 28.71709197992459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 721.4017944335938, "completions/mean_terminated_length": 415.26373291015625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.8823528289794922, "epoch": 0.07202993451824134, "frac_reward_zero_std": 0.5, "grad_norm": 0.419262170791626, "kl": 0.024826692417263985, "learning_rate": 9.289055191768008e-06, "loss": 0.1554, "num_tokens": 13155884.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.1652934551239014, "sampling/importance_sampling_ratio/mean": 0.7766127586364746, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49439287185668945, "sampling/sampling_logp_difference/mean": 0.01525493897497654, "step": 77, "step_time": 30.37947072600946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 756.9642944335938, "completions/mean_terminated_length": 423.3258361816406, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.89890256524086, "epoch": 0.07296538821328344, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.3526546061038971, "kl": 0.023157723248004913, "learning_rate": 9.279700654817587e-06, "loss": 0.047, "num_tokens": 13320552.0, "reward": 0.6339285969734192, "reward_std": 0.4838944375514984, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.8303377628326416, "sampling/importance_sampling_ratio/mean": 0.7735016942024231, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4833714962005615, "sampling/sampling_logp_difference/mean": 0.01424524374306202, "step": 78, "step_time": 30.639315341133624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 797.9285888671875, "completions/mean_terminated_length": 492.3555603027344, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.7688489109277725, "epoch": 0.07390084190832553, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.4364739954471588, "kl": 0.02500696899369359, "learning_rate": 9.270346117867166e-06, "loss": 0.1322, "num_tokens": 13492712.0, "reward": 0.660714328289032, "reward_std": 0.4755948781967163, "rewards/math_reward/mean": 0.6607142686843872, "rewards/math_reward/std": 0.4755948781967163, "sampling/importance_sampling_ratio/max": 2.5124411582946777, "sampling/importance_sampling_ratio/mean": 0.8315368294715881, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6844038963317871, "sampling/sampling_logp_difference/mean": 0.014343070797622204, "step": 79, "step_time": 31.466018202714622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 777.8035888671875, "completions/mean_terminated_length": 484.68133544921875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.714550070464611, "epoch": 0.07483629560336763, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.31674137711524963, "kl": 0.02259710431098938, "learning_rate": 9.260991580916746e-06, "loss": -0.0086, "num_tokens": 13663642.0, "reward": 0.5267857313156128, "reward_std": 0.5015259981155396, "rewards/math_reward/mean": 0.5267857313156128, "rewards/math_reward/std": 0.5015259385108948, "sampling/importance_sampling_ratio/max": 2.9453463554382324, "sampling/importance_sampling_ratio/mean": 0.8781587481498718, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4851992130279541, "sampling/sampling_logp_difference/mean": 0.013700034469366074, "step": 80, "step_time": 31.421180576086044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 736.5267944335938, "completions/mean_terminated_length": 517.9479370117188, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.6793666332960129, "epoch": 0.07577174929840973, "frac_reward_zero_std": 0.5, "grad_norm": 0.2355533242225647, "kl": 0.022849100176244974, "learning_rate": 9.251637043966323e-06, "loss": 0.0199, "num_tokens": 13824957.0, "reward": 0.5625, "reward_std": 0.4983079433441162, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.853574752807617, "sampling/importance_sampling_ratio/mean": 0.6036138534545898, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4848747253417969, "sampling/sampling_logp_difference/mean": 0.013857844285666943, "step": 81, "step_time": 30.264461544808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 620.5089721679688, "completions/mean_terminated_length": 465.03961181640625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.6451822742819786, "epoch": 0.07670720299345182, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.5992428064346313, "kl": 0.025073510594666004, "learning_rate": 9.242282507015905e-06, "loss": 0.2053, "num_tokens": 13976526.0, "reward": 0.598214328289032, "reward_std": 0.49246248602867126, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.4924624562263489, "sampling/importance_sampling_ratio/max": 2.8946008682250977, "sampling/importance_sampling_ratio/mean": 0.8260201215744019, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48146557807922363, "sampling/sampling_logp_difference/mean": 0.014194856397807598, "step": 82, "step_time": 27.960969497915357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 741.3392944335938, "completions/mean_terminated_length": 507.51580810546875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.8084597736597061, "epoch": 0.07764265668849392, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.4401065409183502, "kl": 0.026263018138706684, "learning_rate": 9.232927970065482e-06, "loss": 0.1349, "num_tokens": 14140380.0, "reward": 0.4732142984867096, "reward_std": 0.5015259981155396, "rewards/math_reward/mean": 0.4732142984867096, "rewards/math_reward/std": 0.5015259981155396, "sampling/importance_sampling_ratio/max": 2.5734527111053467, "sampling/importance_sampling_ratio/mean": 0.7767904996871948, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4940524101257324, "sampling/sampling_logp_difference/mean": 0.013923908583819866, "step": 83, "step_time": 30.39885660307482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 760.4375610351562, "completions/mean_terminated_length": 480.5326232910156, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.8274661004543304, "epoch": 0.07857811038353602, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.47246453166007996, "kl": 0.024034636095166206, "learning_rate": 9.223573433115062e-06, "loss": 0.0887, "num_tokens": 14306277.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571844935417175, "sampling/importance_sampling_ratio/max": 2.9402225017547607, "sampling/importance_sampling_ratio/mean": 0.8122678399085999, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.564521312713623, "sampling/sampling_logp_difference/mean": 0.014668033458292484, "step": 84, "step_time": 30.78923225705512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 624.482177734375, "completions/mean_terminated_length": 453.6600036621094, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.6183740273118019, "epoch": 0.07951356407857811, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.30264219641685486, "kl": 0.023532499559223652, "learning_rate": 9.214218896164641e-06, "loss": 0.0754, "num_tokens": 14456315.0, "reward": 0.7410714626312256, "reward_std": 0.4400150775909424, "rewards/math_reward/mean": 0.7410714030265808, "rewards/math_reward/std": 0.44001504778862, "sampling/importance_sampling_ratio/max": 2.8970091342926025, "sampling/importance_sampling_ratio/mean": 0.8169851303100586, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7029047012329102, "sampling/sampling_logp_difference/mean": 0.01274169608950615, "step": 85, "step_time": 28.468509156024083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 724.3303833007812, "completions/mean_terminated_length": 453.9032287597656, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.527066208422184, "epoch": 0.08044901777362021, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.329982727766037, "kl": 0.020950476173311472, "learning_rate": 9.20486435921422e-06, "loss": 0.0897, "num_tokens": 14618576.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571844935417175, "sampling/importance_sampling_ratio/max": 2.8168132305145264, "sampling/importance_sampling_ratio/mean": 0.7676630020141602, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49971961975097656, "sampling/sampling_logp_difference/mean": 0.011643286794424057, "step": 86, "step_time": 31.758362980093807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.133928582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 631.7142944335938, "completions/mean_terminated_length": 412.7010192871094, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.6337912827730179, "epoch": 0.0813844714686623, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.350423127412796, "kl": 0.02736091846600175, "learning_rate": 9.195509822263798e-06, "loss": 0.0263, "num_tokens": 14772128.0, "reward": 0.5892857313156128, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.5892857313156128, "rewards/math_reward/std": 0.4941745698451996, "sampling/importance_sampling_ratio/max": 2.6372015476226807, "sampling/importance_sampling_ratio/mean": 0.7375406622886658, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.016364574432373, "sampling/sampling_logp_difference/mean": 0.01377321220934391, "step": 87, "step_time": 28.854622235987335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 753.9642944335938, "completions/mean_terminated_length": 455.3406677246094, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.6310190632939339, "epoch": 0.0823199251637044, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.31613779067993164, "kl": 0.024178009014576674, "learning_rate": 9.186155285313377e-06, "loss": 0.0082, "num_tokens": 14938380.0, "reward": 0.6071428656578064, "reward_std": 0.4905804991722107, "rewards/math_reward/mean": 0.6071428656578064, "rewards/math_reward/std": 0.4905804991722107, "sampling/importance_sampling_ratio/max": 2.9498720169067383, "sampling/importance_sampling_ratio/mean": 0.7452073097229004, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7672164440155029, "sampling/sampling_logp_difference/mean": 0.012241823598742485, "step": 88, "step_time": 30.635071597993374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 651.1428833007812, "completions/mean_terminated_length": 451.5918273925781, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.8439919352531433, "epoch": 0.0832553788587465, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3692947328090668, "kl": 0.023488550446927547, "learning_rate": 9.176800748362957e-06, "loss": -0.0388, "num_tokens": 15095756.0, "reward": 0.535714328289032, "reward_std": 0.500964343547821, "rewards/math_reward/mean": 0.5357142686843872, "rewards/math_reward/std": 0.5009642839431763, "sampling/importance_sampling_ratio/max": 2.630074977874756, "sampling/importance_sampling_ratio/mean": 0.7284424901008606, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4913210868835449, "sampling/sampling_logp_difference/mean": 0.01553274504840374, "step": 89, "step_time": 28.897316553164274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 640.25, "completions/mean_terminated_length": 405.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.7016375586390495, "epoch": 0.08419083255378859, "frac_reward_zero_std": 0.5, "grad_norm": 0.31923767924308777, "kl": 0.024634855333715677, "learning_rate": 9.167446211412536e-06, "loss": 0.0393, "num_tokens": 15250120.0, "reward": 0.660714328289032, "reward_std": 0.4755948781967163, "rewards/math_reward/mean": 0.6607142686843872, "rewards/math_reward/std": 0.4755948483943939, "sampling/importance_sampling_ratio/max": 2.880533456802368, "sampling/importance_sampling_ratio/mean": 0.8277389407157898, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8547272682189941, "sampling/sampling_logp_difference/mean": 0.013528568670153618, "step": 90, "step_time": 28.686227132100612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 817.0357666015625, "completions/mean_terminated_length": 532.967041015625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.6025129035115242, "epoch": 0.08512628624883069, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.20962831377983093, "kl": 0.025089120026677847, "learning_rate": 9.158091674462115e-06, "loss": 0.0236, "num_tokens": 15422908.0, "reward": 0.5625, "reward_std": 0.4983079433441162, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.3816800117492676, "sampling/importance_sampling_ratio/mean": 0.6875706911087036, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49576520919799805, "sampling/sampling_logp_difference/mean": 0.011876057833433151, "step": 91, "step_time": 31.014363660709932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 642.169677734375, "completions/mean_terminated_length": 372.96807861328125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.9310932755470276, "epoch": 0.08606173994387278, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.4275594651699066, "kl": 0.027558551635593176, "learning_rate": 9.148737137511695e-06, "loss": 0.0796, "num_tokens": 15579039.0, "reward": 0.5267857313156128, "reward_std": 0.5015259981155396, "rewards/math_reward/mean": 0.5267857313156128, "rewards/math_reward/std": 0.5015259385108948, "sampling/importance_sampling_ratio/max": 2.8182361125946045, "sampling/importance_sampling_ratio/mean": 0.7539399266242981, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.492159366607666, "sampling/sampling_logp_difference/mean": 0.017420215532183647, "step": 92, "step_time": 29.228749696165323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 727.6517944335938, "completions/mean_terminated_length": 474.8191223144531, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.6072985455393791, "epoch": 0.08699719363891488, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.4719700515270233, "kl": 0.023732822854071856, "learning_rate": 9.139382600561273e-06, "loss": 0.0562, "num_tokens": 15740152.0, "reward": 0.598214328289032, "reward_std": 0.49246251583099365, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.49246248602867126, "sampling/importance_sampling_ratio/max": 2.939267158508301, "sampling/importance_sampling_ratio/mean": 0.7655865550041199, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9209880828857422, "sampling/sampling_logp_difference/mean": 0.013610387220978737, "step": 93, "step_time": 30.17977533210069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 736.8035888671875, "completions/mean_terminated_length": 502.16845703125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.5395463109016418, "epoch": 0.08793264733395698, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.32450443506240845, "kl": 0.020979616325348616, "learning_rate": 9.130028063610852e-06, "loss": 0.1082, "num_tokens": 15903410.0, "reward": 0.5535714626312256, "reward_std": 0.49935609102249146, "rewards/math_reward/mean": 0.5535714030265808, "rewards/math_reward/std": 0.49935609102249146, "sampling/importance_sampling_ratio/max": 2.6613516807556152, "sampling/importance_sampling_ratio/mean": 0.7611078023910522, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4902224540710449, "sampling/sampling_logp_difference/mean": 0.010785216465592384, "step": 94, "step_time": 30.37392402300611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.133928582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 612.2589721679688, "completions/mean_terminated_length": 390.2370910644531, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 0.8624311983585358, "epoch": 0.08886810102899906, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.37286874651908875, "kl": 0.023891277611255646, "learning_rate": 9.120673526660431e-06, "loss": -0.0326, "num_tokens": 16050495.0, "reward": 0.7500000596046448, "reward_std": 0.43495887517929077, "rewards/math_reward/mean": 0.75, "rewards/math_reward/std": 0.4349588453769684, "sampling/importance_sampling_ratio/max": 2.907604455947876, "sampling/importance_sampling_ratio/mean": 0.8327441811561584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4946894645690918, "sampling/sampling_logp_difference/mean": 0.014705377630889416, "step": 95, "step_time": 28.17714851698838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 643.2053833007812, "completions/mean_terminated_length": 474.6299743652344, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.5961093381047249, "epoch": 0.08980355472404115, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.5040040016174316, "kl": 0.020656160544604063, "learning_rate": 9.111318989710009e-06, "loss": 0.1675, "num_tokens": 16212838.0, "reward": 0.7500000596046448, "reward_std": 0.4349588453769684, "rewards/math_reward/mean": 0.75, "rewards/math_reward/std": 0.4349588453769684, "sampling/importance_sampling_ratio/max": 2.7731642723083496, "sampling/importance_sampling_ratio/mean": 0.850098192691803, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46135711669921875, "sampling/sampling_logp_difference/mean": 0.013852788135409355, "step": 96, "step_time": 29.360433152876794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 756.0892944335938, "completions/mean_terminated_length": 440.2889099121094, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 0.810070663690567, "epoch": 0.09073900841908325, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.35011982917785645, "kl": 0.02223933907225728, "learning_rate": 9.10196445275959e-06, "loss": 0.0433, "num_tokens": 16378104.0, "reward": 0.598214328289032, "reward_std": 0.49246248602867126, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.4924624562263489, "sampling/importance_sampling_ratio/max": 2.2871720790863037, "sampling/importance_sampling_ratio/mean": 0.7087889909744263, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4774353504180908, "sampling/sampling_logp_difference/mean": 0.01463357824832201, "step": 97, "step_time": 30.288517085835338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 732.232177734375, "completions/mean_terminated_length": 410.6000061035156, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.8562614172697067, "epoch": 0.09167446211412535, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.2775327265262604, "kl": 0.024398899637162685, "learning_rate": 9.09260991580917e-06, "loss": -0.0576, "num_tokens": 16542490.0, "reward": 0.6339285969734192, "reward_std": 0.4838944375514984, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.7911064624786377, "sampling/importance_sampling_ratio/mean": 0.8616927266120911, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.45514917373657227, "sampling/sampling_logp_difference/mean": 0.015032831579446793, "step": 98, "step_time": 30.220507632941008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 770.169677734375, "completions/mean_terminated_length": 541.5053100585938, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.6977381855249405, "epoch": 0.09260991580916744, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.24293072521686554, "kl": 0.01973670069128275, "learning_rate": 9.083255378858747e-06, "loss": -0.0242, "num_tokens": 16705293.0, "reward": 0.6339285969734192, "reward_std": 0.4838944375514984, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.938253402709961, "sampling/importance_sampling_ratio/mean": 0.8178620934486389, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49120092391967773, "sampling/sampling_logp_difference/mean": 0.012250206433236599, "step": 99, "step_time": 30.808119937079027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 721.6160888671875, "completions/mean_terminated_length": 415.5274963378906, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.7117512226104736, "epoch": 0.09354536950420954, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.30413252115249634, "kl": 0.02714451774954796, "learning_rate": 9.073900841908326e-06, "loss": 0.0147, "num_tokens": 16866658.0, "reward": 0.7321428656578064, "reward_std": 0.44483304023742676, "rewards/math_reward/mean": 0.7321428656578064, "rewards/math_reward/std": 0.44483304023742676, "sampling/importance_sampling_ratio/max": 2.8232192993164062, "sampling/importance_sampling_ratio/mean": 0.8134698271751404, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49734973907470703, "sampling/sampling_logp_difference/mean": 0.01406124234199524, "step": 100, "step_time": 29.688108589267358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 554.0892944335938, "completions/mean_terminated_length": 374.8199768066406, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.6631470173597336, "epoch": 0.09448082319925163, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.4670555591583252, "kl": 0.023358399979770184, "learning_rate": 9.064546304957906e-06, "loss": 0.0439, "num_tokens": 17012596.0, "reward": 0.7142857313156128, "reward_std": 0.453784316778183, "rewards/math_reward/mean": 0.7142857313156128, "rewards/math_reward/std": 0.4537842869758606, "sampling/importance_sampling_ratio/max": 2.4758520126342773, "sampling/importance_sampling_ratio/mean": 0.8785476088523865, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49544286727905273, "sampling/sampling_logp_difference/mean": 0.013870622962713242, "step": 101, "step_time": 27.7277418281883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 756.044677734375, "completions/mean_terminated_length": 492.0967712402344, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.753823459148407, "epoch": 0.09541627689429373, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.274311363697052, "kl": 0.02364560915157199, "learning_rate": 9.055191768007483e-06, "loss": 0.0051, "num_tokens": 17178353.0, "reward": 0.4910714626312256, "reward_std": 0.5021671056747437, "rewards/math_reward/mean": 0.4910714328289032, "rewards/math_reward/std": 0.5021671056747437, "sampling/importance_sampling_ratio/max": 2.020312547683716, "sampling/importance_sampling_ratio/mean": 0.669426441192627, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5882487297058105, "sampling/sampling_logp_difference/mean": 0.013723825104534626, "step": 102, "step_time": 30.495858160080388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 890.3660888671875, "completions/mean_terminated_length": 557.712646484375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.9650271683931351, "epoch": 0.09635173058933583, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.293904185295105, "kl": 0.023473017383366823, "learning_rate": 9.045837231057063e-06, "loss": 0.0547, "num_tokens": 17356042.0, "reward": 0.4910714626312256, "reward_std": 0.5021671652793884, "rewards/math_reward/mean": 0.4910714328289032, "rewards/math_reward/std": 0.5021671056747437, "sampling/importance_sampling_ratio/max": 2.954164743423462, "sampling/importance_sampling_ratio/mean": 0.7522269487380981, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49136781692504883, "sampling/sampling_logp_difference/mean": 0.014925838448107243, "step": 103, "step_time": 32.99379833508283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 703.6160888671875, "completions/mean_terminated_length": 446.18084716796875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.7805739045143127, "epoch": 0.09728718428437792, "frac_reward_zero_std": 0.5, "grad_norm": 0.325752854347229, "kl": 0.0221687825396657, "learning_rate": 9.036482694106642e-06, "loss": -0.0458, "num_tokens": 17516303.0, "reward": 0.660714328289032, "reward_std": 0.4755948781967163, "rewards/math_reward/mean": 0.6607142686843872, "rewards/math_reward/std": 0.4755948781967163, "sampling/importance_sampling_ratio/max": 2.2403652667999268, "sampling/importance_sampling_ratio/mean": 0.78910893201828, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47359180450439453, "sampling/sampling_logp_difference/mean": 0.012957045808434486, "step": 104, "step_time": 34.662410606397316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.281728696601931e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.281728696601931e-05, "completions/clipped_ratio": 0.267857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 867.2500610351562, "completions/mean_terminated_length": 435.2682800292969, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.8600903749465942, "epoch": 0.09822263797942002, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.3531532883644104, "kl": 0.02397845033556223, "learning_rate": 9.027128157156222e-06, "loss": 0.1088, "num_tokens": 17689987.0, "reward": 0.535714328289032, "reward_std": 0.500964343547821, "rewards/math_reward/mean": 0.5357142686843872, "rewards/math_reward/std": 0.5009642839431763, "sampling/importance_sampling_ratio/max": 2.9771804809570312, "sampling/importance_sampling_ratio/mean": 0.7390323877334595, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7029304504394531, "sampling/sampling_logp_difference/mean": 0.014603538438677788, "step": 105, "step_time": 33.835389300016686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 816.0357666015625, "completions/mean_terminated_length": 514.888916015625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.8621076941490173, "epoch": 0.09915809167446211, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.2480567842721939, "kl": 0.022621055133640766, "learning_rate": 9.017773620205801e-06, "loss": -0.0445, "num_tokens": 17866543.0, "reward": 0.5267857313156128, "reward_std": 0.5015259981155396, "rewards/math_reward/mean": 0.5267857313156128, "rewards/math_reward/std": 0.5015259385108948, "sampling/importance_sampling_ratio/max": 2.7118401527404785, "sampling/importance_sampling_ratio/mean": 0.6742488741874695, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6126499176025391, "sampling/sampling_logp_difference/mean": 0.014763915911316872, "step": 106, "step_time": 33.66475151665509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 786.0535888671875, "completions/mean_terminated_length": 477.5777893066406, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.640127032995224, "epoch": 0.10009354536950421, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.24037525057792664, "kl": 0.02059887908399105, "learning_rate": 9.00841908325538e-06, "loss": 0.0005, "num_tokens": 18034365.0, "reward": 0.8035714626312256, "reward_std": 0.3990819752216339, "rewards/math_reward/mean": 0.8035714030265808, "rewards/math_reward/std": 0.3990819752216339, "sampling/importance_sampling_ratio/max": 2.8767940998077393, "sampling/importance_sampling_ratio/mean": 0.8309966921806335, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47564125061035156, "sampling/sampling_logp_difference/mean": 0.011786062270402908, "step": 107, "step_time": 30.93794283317402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 697.9464721679688, "completions/mean_terminated_length": 456.35791015625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.5882241651415825, "epoch": 0.10102899906454631, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.5053687691688538, "kl": 0.021651040762662888, "learning_rate": 8.999064546304958e-06, "loss": 0.195, "num_tokens": 18194871.0, "reward": 0.6785714626312256, "reward_std": 0.46912387013435364, "rewards/math_reward/mean": 0.6785714030265808, "rewards/math_reward/std": 0.46912387013435364, "sampling/importance_sampling_ratio/max": 2.5042991638183594, "sampling/importance_sampling_ratio/mean": 0.8277543187141418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4759488105773926, "sampling/sampling_logp_difference/mean": 0.01140574924647808, "step": 108, "step_time": 29.60721989488229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 820.2767944335938, "completions/mean_terminated_length": 467.4827575683594, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.6075391694903374, "epoch": 0.1019644527595884, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.27292028069496155, "kl": 0.021350522991269827, "learning_rate": 8.989710009354537e-06, "loss": -0.0615, "num_tokens": 18362526.0, "reward": 0.4821428656578064, "reward_std": 0.5019267797470093, "rewards/math_reward/mean": 0.4821428656578064, "rewards/math_reward/std": 0.5019267797470093, "sampling/importance_sampling_ratio/max": 2.7917160987854004, "sampling/importance_sampling_ratio/mean": 0.7364131212234497, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4950428009033203, "sampling/sampling_logp_difference/mean": 0.011695979163050652, "step": 109, "step_time": 31.39874632516876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 942.232177734375, "completions/mean_terminated_length": 499.9250183105469, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.8173087239265442, "epoch": 0.1028999064546305, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.27261027693748474, "kl": 0.021475217305123806, "learning_rate": 8.980355472404117e-06, "loss": -0.0534, "num_tokens": 18549536.0, "reward": 0.660714328289032, "reward_std": 0.4755948781967163, "rewards/math_reward/mean": 0.6607142686843872, "rewards/math_reward/std": 0.4755948781967163, "sampling/importance_sampling_ratio/max": 2.8775486946105957, "sampling/importance_sampling_ratio/mean": 0.8223318457603455, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5720715522766113, "sampling/sampling_logp_difference/mean": 0.01264828722923994, "step": 110, "step_time": 35.28679958009161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 651.1964721679688, "completions/mean_terminated_length": 365.82794189453125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.6552959829568863, "epoch": 0.1038353601496726, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3876038193702698, "kl": 0.0216083824634552, "learning_rate": 8.971000935453696e-06, "loss": -0.1125, "num_tokens": 18698214.0, "reward": 0.7142857313156128, "reward_std": 0.4537842869758606, "rewards/math_reward/mean": 0.7142857313156128, "rewards/math_reward/std": 0.4537842869758606, "sampling/importance_sampling_ratio/max": 2.8217661380767822, "sampling/importance_sampling_ratio/mean": 0.8658309578895569, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46915584802627563, "sampling/sampling_logp_difference/mean": 0.012713463976979256, "step": 111, "step_time": 28.880119456211105 }, { "clip_ratio/high_max": 0.0003143736466881819, "clip_ratio/high_mean": 4.491052277444396e-05, "clip_ratio/low_mean": 4.102732418687083e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.593784696131479e-05, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 800.1517944335938, "completions/mean_terminated_length": 422.8953552246094, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.8356106877326965, "epoch": 0.10477081384471469, "frac_reward_zero_std": 0.5, "grad_norm": 0.2794932723045349, "kl": 0.02213257271796465, "learning_rate": 8.961646398503275e-06, "loss": 0.1851, "num_tokens": 18869319.0, "reward": 0.7321428656578064, "reward_std": 0.44483307003974915, "rewards/math_reward/mean": 0.7321428656578064, "rewards/math_reward/std": 0.44483304023742676, "sampling/importance_sampling_ratio/max": 2.9693264961242676, "sampling/importance_sampling_ratio/mean": 0.8001471757888794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47028589248657227, "sampling/sampling_logp_difference/mean": 0.011985650286078453, "step": 112, "step_time": 31.632992930011824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 808.1160888671875, "completions/mean_terminated_length": 433.2674255371094, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.5987582132220268, "epoch": 0.10570626753975679, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3246399462223053, "kl": 0.02264844113960862, "learning_rate": 8.952291861552855e-06, "loss": -0.0484, "num_tokens": 19044060.0, "reward": 0.6160714626312256, "reward_std": 0.48852667212486267, "rewards/math_reward/mean": 0.6160714030265808, "rewards/math_reward/std": 0.4885266423225403, "sampling/importance_sampling_ratio/max": 2.6680288314819336, "sampling/importance_sampling_ratio/mean": 0.8019680976867676, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6216341257095337, "sampling/sampling_logp_difference/mean": 0.011696546338498592, "step": 113, "step_time": 31.905618467135355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 815.9464721679688, "completions/mean_terminated_length": 443.4651184082031, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5933007150888443, "epoch": 0.10664172123479888, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.26487869024276733, "kl": 0.024494424927979708, "learning_rate": 8.942937324602433e-06, "loss": -0.0254, "num_tokens": 19217782.0, "reward": 0.6428571939468384, "reward_std": 0.48131096363067627, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.48131096363067627, "sampling/importance_sampling_ratio/max": 2.436768054962158, "sampling/importance_sampling_ratio/mean": 0.7889321446418762, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5268858075141907, "sampling/sampling_logp_difference/mean": 0.01221440825611353, "step": 114, "step_time": 32.66105648688972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 831.1428833007812, "completions/mean_terminated_length": 463.25579833984375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.7983029335737228, "epoch": 0.10757717492984098, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3057413697242737, "kl": 0.02465883083641529, "learning_rate": 8.933582787652012e-06, "loss": -0.026, "num_tokens": 19397214.0, "reward": 0.6428571939468384, "reward_std": 0.4813109338283539, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.48131096363067627, "sampling/importance_sampling_ratio/max": 2.8666181564331055, "sampling/importance_sampling_ratio/mean": 0.7686472535133362, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49730348587036133, "sampling/sampling_logp_difference/mean": 0.014569006860256195, "step": 115, "step_time": 33.04371024342254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 783.8214721679688, "completions/mean_terminated_length": 382.2588195800781, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.682896114885807, "epoch": 0.10851262862488306, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.26609504222869873, "kl": 0.02349651837721467, "learning_rate": 8.924228250701591e-06, "loss": 0.0109, "num_tokens": 19565610.0, "reward": 0.6875000596046448, "reward_std": 0.46559563279151917, "rewards/math_reward/mean": 0.6875, "rewards/math_reward/std": 0.4655956029891968, "sampling/importance_sampling_ratio/max": 2.1770365238189697, "sampling/importance_sampling_ratio/mean": 0.7038411498069763, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.44570302963256836, "sampling/sampling_logp_difference/mean": 0.012808958999812603, "step": 116, "step_time": 31.112071285955608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 878.3839721679688, "completions/mean_terminated_length": 469.7228698730469, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.7489656209945679, "epoch": 0.10944808231992516, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.28226375579833984, "kl": 0.023425099439918995, "learning_rate": 8.91487371375117e-06, "loss": 0.0589, "num_tokens": 19746077.0, "reward": 0.7500000596046448, "reward_std": 0.4349588453769684, "rewards/math_reward/mean": 0.75, "rewards/math_reward/std": 0.4349588453769684, "sampling/importance_sampling_ratio/max": 2.8551571369171143, "sampling/importance_sampling_ratio/mean": 0.7795648574829102, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4914402961730957, "sampling/sampling_logp_difference/mean": 0.013223295100033283, "step": 117, "step_time": 34.05552626796998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 939.6428833007812, "completions/mean_terminated_length": 435.8441467285156, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 1.0271970927715302, "epoch": 0.11038353601496725, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.4698631167411804, "kl": 0.024006573017686605, "learning_rate": 8.905519176800748e-06, "loss": 0.0942, "num_tokens": 19932357.0, "reward": 0.6428571939468384, "reward_std": 0.48131096363067627, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.48131096363067627, "sampling/importance_sampling_ratio/max": 2.7074899673461914, "sampling/importance_sampling_ratio/mean": 0.7092493176460266, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7338097095489502, "sampling/sampling_logp_difference/mean": 0.01605825684964657, "step": 118, "step_time": 35.050098618958145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 809.8303833007812, "completions/mean_terminated_length": 472.1477355957031, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.5908288136124611, "epoch": 0.11131898971000935, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.4932723045349121, "kl": 0.021413424517959356, "learning_rate": 8.896164639850328e-06, "loss": 0.1662, "num_tokens": 20105034.0, "reward": 0.6875000596046448, "reward_std": 0.46559563279151917, "rewards/math_reward/mean": 0.6875, "rewards/math_reward/std": 0.4655956029891968, "sampling/importance_sampling_ratio/max": 2.711989164352417, "sampling/importance_sampling_ratio/mean": 0.8167208433151245, "sampling/importance_sampling_ratio/min": 0.000475635431939736, "sampling/sampling_logp_difference/max": 0.47264397144317627, "sampling/sampling_logp_difference/mean": 0.011726352386176586, "step": 119, "step_time": 32.017697176197544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1069.46435546875, "completions/mean_terminated_length": 624.6753540039062, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.682678833603859, "epoch": 0.11225444340505145, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.21770359575748444, "kl": 0.023754436057060957, "learning_rate": 8.886810102899907e-06, "loss": 0.029, "num_tokens": 20309390.0, "reward": 0.4196428656578064, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.4196428656578064, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.842855930328369, "sampling/importance_sampling_ratio/mean": 0.7591627836227417, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4969944953918457, "sampling/sampling_logp_difference/mean": 0.013269728049635887, "step": 120, "step_time": 39.44302791589871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 717.6517944335938, "completions/mean_terminated_length": 479.5895080566406, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.7031509578227997, "epoch": 0.11318989710009354, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.4828949570655823, "kl": 0.026314132381230593, "learning_rate": 8.877455565949486e-06, "loss": 0.0619, "num_tokens": 20468527.0, "reward": 0.6339285969734192, "reward_std": 0.4838944375514984, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.8452789783477783, "sampling/importance_sampling_ratio/mean": 0.8095670342445374, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48099803924560547, "sampling/sampling_logp_difference/mean": 0.013460630550980568, "step": 121, "step_time": 30.504324027104303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 731.1250610351562, "completions/mean_terminated_length": 444.84783935546875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.7984932363033295, "epoch": 0.11412535079513564, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.4219631850719452, "kl": 0.024295355658978224, "learning_rate": 8.868101028999066e-06, "loss": 0.1, "num_tokens": 20632325.0, "reward": 0.5892857313156128, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.5892857313156128, "rewards/math_reward/std": 0.4941745698451996, "sampling/importance_sampling_ratio/max": 2.9066410064697266, "sampling/importance_sampling_ratio/mean": 0.8347505927085876, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47962427139282227, "sampling/sampling_logp_difference/mean": 0.015229911543428898, "step": 122, "step_time": 30.204741147113964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 701.9107666015625, "completions/mean_terminated_length": 391.2747497558594, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.778212770819664, "epoch": 0.11506080449017773, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.9994753003120422, "kl": 0.02520068921148777, "learning_rate": 8.858746492048645e-06, "loss": 0.3097, "num_tokens": 20792499.0, "reward": 0.7142857313156128, "reward_std": 0.45378434658050537, "rewards/math_reward/mean": 0.7142857313156128, "rewards/math_reward/std": 0.4537842869758606, "sampling/importance_sampling_ratio/max": 2.995610237121582, "sampling/importance_sampling_ratio/mean": 0.8877363204956055, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47868919372558594, "sampling/sampling_logp_difference/mean": 0.015339827165007591, "step": 123, "step_time": 30.21161715290509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 550.2232666015625, "completions/mean_terminated_length": 336.2550964355469, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.8416875898838043, "epoch": 0.11599625818521983, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.3751614987850189, "kl": 0.02651203190907836, "learning_rate": 8.849391955098223e-06, "loss": 0.0069, "num_tokens": 20940276.0, "reward": 0.7946428656578064, "reward_std": 0.4057779014110565, "rewards/math_reward/mean": 0.7946428656578064, "rewards/math_reward/std": 0.4057779014110565, "sampling/importance_sampling_ratio/max": 2.6515471935272217, "sampling/importance_sampling_ratio/mean": 0.7579895257949829, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47403907775878906, "sampling/sampling_logp_difference/mean": 0.015350197441875935, "step": 124, "step_time": 27.825599097879604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 756.169677734375, "completions/mean_terminated_length": 458.0549621582031, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.7440682798624039, "epoch": 0.11693171188026193, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.30048656463623047, "kl": 0.025277110282331705, "learning_rate": 8.840037418147802e-06, "loss": 0.0221, "num_tokens": 21108511.0, "reward": 0.598214328289032, "reward_std": 0.49246251583099365, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.49246248602867126, "sampling/importance_sampling_ratio/max": 2.700679063796997, "sampling/importance_sampling_ratio/mean": 0.7780976295471191, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.494326114654541, "sampling/sampling_logp_difference/mean": 0.013498984277248383, "step": 125, "step_time": 30.73234811797738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1704.0, "completions/mean_length": 776.4910888671875, "completions/mean_terminated_length": 429.7159118652344, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.9491439163684845, "epoch": 0.11786716557530402, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.19341996312141418, "kl": 0.025042998604476452, "learning_rate": 8.830682881197382e-06, "loss": -0.0314, "num_tokens": 21273318.0, "reward": 0.6071428656578064, "reward_std": 0.4905804991722107, "rewards/math_reward/mean": 0.6071428656578064, "rewards/math_reward/std": 0.4905804991722107, "sampling/importance_sampling_ratio/max": 2.9313366413116455, "sampling/importance_sampling_ratio/mean": 0.7630738019943237, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7422018051147461, "sampling/sampling_logp_difference/mean": 0.01593460887670517, "step": 126, "step_time": 31.001274168957025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 630.3392944335938, "completions/mean_terminated_length": 427.8163146972656, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.6412579119205475, "epoch": 0.11880261927034612, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3463010787963867, "kl": 0.024864980950951576, "learning_rate": 8.821328344246961e-06, "loss": 0.081, "num_tokens": 21428252.0, "reward": 0.7500000596046448, "reward_std": 0.4349588453769684, "rewards/math_reward/mean": 0.75, "rewards/math_reward/std": 0.4349588453769684, "sampling/importance_sampling_ratio/max": 2.816514492034912, "sampling/importance_sampling_ratio/mean": 0.8518850207328796, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47934579849243164, "sampling/sampling_logp_difference/mean": 0.01357725728303194, "step": 127, "step_time": 28.713844551006332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 721.5178833007812, "completions/mean_terminated_length": 415.4066162109375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.650695726275444, "epoch": 0.11973807296538821, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.4204951524734497, "kl": 0.024052571039646864, "learning_rate": 8.81197380729654e-06, "loss": 0.0802, "num_tokens": 21590190.0, "reward": 0.7053571939468384, "reward_std": 0.45793095231056213, "rewards/math_reward/mean": 0.7053571343421936, "rewards/math_reward/std": 0.45793095231056213, "sampling/importance_sampling_ratio/max": 2.435624599456787, "sampling/importance_sampling_ratio/mean": 0.7791306376457214, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6384267807006836, "sampling/sampling_logp_difference/mean": 0.013753154315054417, "step": 128, "step_time": 29.70146487909369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 884.7857666015625, "completions/mean_terminated_length": 616.3516845703125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.8058381527662277, "epoch": 0.12067352666043031, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.16848164796829224, "kl": 0.02469521900638938, "learning_rate": 8.802619270346118e-06, "loss": 0.0316, "num_tokens": 21767150.0, "reward": 0.3750000298023224, "reward_std": 0.4862987697124481, "rewards/math_reward/mean": 0.375, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.9961934089660645, "sampling/importance_sampling_ratio/mean": 0.7519814372062683, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49674367904663086, "sampling/sampling_logp_difference/mean": 0.01501318160444498, "step": 129, "step_time": 32.677269246196374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 680.7053833007812, "completions/mean_terminated_length": 485.3775329589844, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.6709459945559502, "epoch": 0.1216089803554724, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.29947835206985474, "kl": 0.02549372147768736, "learning_rate": 8.793264733395697e-06, "loss": 0.0308, "num_tokens": 21919397.0, "reward": 0.6875000596046448, "reward_std": 0.46559563279151917, "rewards/math_reward/mean": 0.6875, "rewards/math_reward/std": 0.4655956029891968, "sampling/importance_sampling_ratio/max": 2.6517333984375, "sampling/importance_sampling_ratio/mean": 0.8100066781044006, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.698246955871582, "sampling/sampling_logp_difference/mean": 0.012380710802972317, "step": 130, "step_time": 28.607717521721497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 722.0892944335938, "completions/mean_terminated_length": 468.19146728515625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.8128711134195328, "epoch": 0.1225444340505145, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.24905703961849213, "kl": 0.02631789306178689, "learning_rate": 8.783910196445277e-06, "loss": -0.036, "num_tokens": 22078255.0, "reward": 0.5, "reward_std": 0.5022472143173218, "rewards/math_reward/mean": 0.5, "rewards/math_reward/std": 0.5022472143173218, "sampling/importance_sampling_ratio/max": 2.708195209503174, "sampling/importance_sampling_ratio/mean": 0.705689549446106, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4972684383392334, "sampling/sampling_logp_difference/mean": 0.015742596238851547, "step": 131, "step_time": 30.10544736427255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 565.357177734375, "completions/mean_terminated_length": 353.551025390625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.66730947047472, "epoch": 0.1234798877455566, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.61083984375, "kl": 0.02693662652745843, "learning_rate": 8.774555659494856e-06, "loss": 0.1595, "num_tokens": 22223735.0, "reward": 0.7678571939468384, "reward_std": 0.4240971803665161, "rewards/math_reward/mean": 0.7678571343421936, "rewards/math_reward/std": 0.4240972101688385, "sampling/importance_sampling_ratio/max": 2.914985179901123, "sampling/importance_sampling_ratio/mean": 0.8204894661903381, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.42850780487060547, "sampling/sampling_logp_difference/mean": 0.013472506776452065, "step": 132, "step_time": 27.765177560038865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 813.3660888671875, "completions/mean_terminated_length": 440.1046447753906, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 1.1227635890245438, "epoch": 0.1244153414405987, "frac_reward_zero_std": 0.5, "grad_norm": 0.45009154081344604, "kl": 0.02778803650289774, "learning_rate": 8.765201122544434e-06, "loss": -0.0226, "num_tokens": 22397544.0, "reward": 0.6071428656578064, "reward_std": 0.4905805289745331, "rewards/math_reward/mean": 0.6071428656578064, "rewards/math_reward/std": 0.4905805289745331, "sampling/importance_sampling_ratio/max": 2.8025963306427, "sampling/importance_sampling_ratio/mean": 0.685200572013855, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6196651458740234, "sampling/sampling_logp_difference/mean": 0.017416130751371384, "step": 133, "step_time": 32.200609928928316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 687.9910888671875, "completions/mean_terminated_length": 493.7040710449219, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.7447779923677444, "epoch": 0.1253507951356408, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.46246451139450073, "kl": 0.024268624372780323, "learning_rate": 8.755846585594013e-06, "loss": -0.0173, "num_tokens": 22553695.0, "reward": 0.598214328289032, "reward_std": 0.49246248602867126, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.4924624562263489, "sampling/importance_sampling_ratio/max": 2.8798718452453613, "sampling/importance_sampling_ratio/mean": 0.823870837688446, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4851865768432617, "sampling/sampling_logp_difference/mean": 0.01493231300264597, "step": 134, "step_time": 29.19689407828264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 763.7589721679688, "completions/mean_terminated_length": 484.57611083984375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.762785017490387, "epoch": 0.12628624883068287, "frac_reward_zero_std": 0.5, "grad_norm": 0.2890458106994629, "kl": 0.024467317387461662, "learning_rate": 8.746492048643593e-06, "loss": 0.0634, "num_tokens": 22717156.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.6234514713287354, "sampling/importance_sampling_ratio/mean": 0.8559027910232544, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5067522525787354, "sampling/sampling_logp_difference/mean": 0.014159145765006542, "step": 135, "step_time": 30.31502486509271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 751.3928833007812, "completions/mean_terminated_length": 469.5217590332031, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.793616272509098, "epoch": 0.12722170252572498, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.2684277296066284, "kl": 0.023368895519524813, "learning_rate": 8.737137511693172e-06, "loss": 0.0707, "num_tokens": 22880848.0, "reward": 0.6964285969734192, "reward_std": 0.4618662893772125, "rewards/math_reward/mean": 0.6964285969734192, "rewards/math_reward/std": 0.4618663191795349, "sampling/importance_sampling_ratio/max": 2.902927875518799, "sampling/importance_sampling_ratio/mean": 0.8181256055831909, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4848825931549072, "sampling/sampling_logp_difference/mean": 0.01483836304396391, "step": 136, "step_time": 30.56783035118133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 763.4910888671875, "completions/mean_terminated_length": 431.5393371582031, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.88212089240551, "epoch": 0.12815715622076707, "frac_reward_zero_std": 0.7142857313156128, "grad_norm": 0.1695277839899063, "kl": 0.023292419966310263, "learning_rate": 8.727782974742751e-06, "loss": -0.018, "num_tokens": 23047631.0, "reward": 0.6875000596046448, "reward_std": 0.4655956029891968, "rewards/math_reward/mean": 0.6875, "rewards/math_reward/std": 0.4655956029891968, "sampling/importance_sampling_ratio/max": 2.486185312271118, "sampling/importance_sampling_ratio/mean": 0.7903108596801758, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4681835174560547, "sampling/sampling_logp_difference/mean": 0.014175722375512123, "step": 137, "step_time": 30.72566891508177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 594.9642944335938, "completions/mean_terminated_length": 436.7128601074219, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.7430385500192642, "epoch": 0.12909260991580918, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.35884544253349304, "kl": 0.022821613121777773, "learning_rate": 8.71842843779233e-06, "loss": 0.0116, "num_tokens": 23196195.0, "reward": 0.5892857313156128, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.5892857313156128, "rewards/math_reward/std": 0.4941745698451996, "sampling/importance_sampling_ratio/max": 2.465674877166748, "sampling/importance_sampling_ratio/mean": 0.8803718686103821, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6249911785125732, "sampling/sampling_logp_difference/mean": 0.013913067989051342, "step": 138, "step_time": 27.902223809389398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 659.794677734375, "completions/mean_terminated_length": 461.4795837402344, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.6879129111766815, "epoch": 0.13002806361085126, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.26820334792137146, "kl": 0.02359225833788514, "learning_rate": 8.709073900841908e-06, "loss": -0.014, "num_tokens": 23351228.0, "reward": 0.5892857313156128, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.5892857313156128, "rewards/math_reward/std": 0.4941745698451996, "sampling/importance_sampling_ratio/max": 2.8981740474700928, "sampling/importance_sampling_ratio/mean": 0.7683890461921692, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5561957955360413, "sampling/sampling_logp_difference/mean": 0.012883106246590614, "step": 139, "step_time": 29.16174201015383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1272.0, "completions/mean_length": 507.794677734375, "completions/mean_terminated_length": 356.79412841796875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.6747871190309525, "epoch": 0.13096351730589337, "frac_reward_zero_std": 0.7142857313156128, "grad_norm": 0.23203442990779877, "kl": 0.02481255028396845, "learning_rate": 8.699719363891488e-06, "loss": 0.0115, "num_tokens": 23490493.0, "reward": 0.8035714626312256, "reward_std": 0.3990819752216339, "rewards/math_reward/mean": 0.8035714030265808, "rewards/math_reward/std": 0.3990819454193115, "sampling/importance_sampling_ratio/max": 2.9839422702789307, "sampling/importance_sampling_ratio/mean": 0.8546848893165588, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.484874963760376, "sampling/sampling_logp_difference/mean": 0.01266761776059866, "step": 140, "step_time": 26.693734363419935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 661.5267944335938, "completions/mean_terminated_length": 396.0318908691406, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.794002965092659, "epoch": 0.13189897100093545, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2995099723339081, "kl": 0.025711319409310818, "learning_rate": 8.690364826941067e-06, "loss": -0.007, "num_tokens": 23647088.0, "reward": 0.6964285969734192, "reward_std": 0.4618662893772125, "rewards/math_reward/mean": 0.6964285969734192, "rewards/math_reward/std": 0.4618663191795349, "sampling/importance_sampling_ratio/max": 2.6572060585021973, "sampling/importance_sampling_ratio/mean": 0.8513964414596558, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6671099662780762, "sampling/sampling_logp_difference/mean": 0.015284458175301552, "step": 141, "step_time": 29.437935571884736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 720.2500610351562, "completions/mean_terminated_length": 395.68890380859375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.7942268773913383, "epoch": 0.13283442469597756, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.5762100219726562, "kl": 0.029590843711048365, "learning_rate": 8.681010289990646e-06, "loss": 0.1056, "num_tokens": 23809412.0, "reward": 0.785714328289032, "reward_std": 0.41217008233070374, "rewards/math_reward/mean": 0.7857142686843872, "rewards/math_reward/std": 0.41217008233070374, "sampling/importance_sampling_ratio/max": 2.6888251304626465, "sampling/importance_sampling_ratio/mean": 0.8356027603149414, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46222448348999023, "sampling/sampling_logp_difference/mean": 0.014682424254715443, "step": 142, "step_time": 30.290219528833404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1435.0, "completions/mean_length": 779.4642944335938, "completions/mean_terminated_length": 414.9425354003906, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.7620804160833359, "epoch": 0.13376987839101964, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3490307629108429, "kl": 0.02628836315125227, "learning_rate": 8.671655753040226e-06, "loss": 0.0411, "num_tokens": 23974600.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.703996181488037, "sampling/importance_sampling_ratio/mean": 0.8362613320350647, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7150084972381592, "sampling/sampling_logp_difference/mean": 0.015256687998771667, "step": 143, "step_time": 30.782382332487032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 649.6339721679688, "completions/mean_terminated_length": 381.8616943359375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.6883799806237221, "epoch": 0.13470533208606175, "frac_reward_zero_std": 0.5, "grad_norm": 0.3070104420185089, "kl": 0.0229444345459342, "learning_rate": 8.662301216089805e-06, "loss": -0.0572, "num_tokens": 24125215.0, "reward": 0.7500000596046448, "reward_std": 0.4349588453769684, "rewards/math_reward/mean": 0.75, "rewards/math_reward/std": 0.4349588453769684, "sampling/importance_sampling_ratio/max": 2.4268336296081543, "sampling/importance_sampling_ratio/mean": 0.8118105530738831, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49834108352661133, "sampling/sampling_logp_difference/mean": 0.014046203345060349, "step": 144, "step_time": 28.989821078022942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 747.6964721679688, "completions/mean_terminated_length": 465.0217590332031, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.5689322203397751, "epoch": 0.13564078578110383, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.34640949964523315, "kl": 0.021064442582428455, "learning_rate": 8.652946679139383e-06, "loss": -0.0029, "num_tokens": 24291525.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.580817461013794, "sampling/importance_sampling_ratio/mean": 0.7706711888313293, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48464035987854004, "sampling/sampling_logp_difference/mean": 0.011080271564424038, "step": 145, "step_time": 30.937108675017953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 726.4642944335938, "completions/mean_terminated_length": 456.4731140136719, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.8364927619695663, "epoch": 0.13657623947614594, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.3264884650707245, "kl": 0.022750768344849348, "learning_rate": 8.643592142188962e-06, "loss": 0.0378, "num_tokens": 24458297.0, "reward": 0.6696428656578064, "reward_std": 0.47245559096336365, "rewards/math_reward/mean": 0.6696428656578064, "rewards/math_reward/std": 0.47245559096336365, "sampling/importance_sampling_ratio/max": 2.729137897491455, "sampling/importance_sampling_ratio/mean": 0.7587852478027344, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5247611999511719, "sampling/sampling_logp_difference/mean": 0.01514550019055605, "step": 146, "step_time": 31.05556473392062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 883.232177734375, "completions/mean_terminated_length": 513.2470703125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.8608011305332184, "epoch": 0.13751169317118803, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.33296024799346924, "kl": 0.02368284296244383, "learning_rate": 8.634237605238542e-06, "loss": 0.0533, "num_tokens": 24638307.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571844935417175, "sampling/importance_sampling_ratio/max": 2.9289112091064453, "sampling/importance_sampling_ratio/mean": 0.7879496812820435, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5643351078033447, "sampling/sampling_logp_difference/mean": 0.01419844850897789, "step": 147, "step_time": 33.52834388194606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 751.3125610351562, "completions/mean_terminated_length": 452.0769348144531, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.9147213101387024, "epoch": 0.1384471468662301, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.5819193720817566, "kl": 0.02298256941139698, "learning_rate": 8.624883068288121e-06, "loss": -0.0575, "num_tokens": 24806990.0, "reward": 0.5267857313156128, "reward_std": 0.5015259981155396, "rewards/math_reward/mean": 0.5267857313156128, "rewards/math_reward/std": 0.5015259981155396, "sampling/importance_sampling_ratio/max": 2.46181058883667, "sampling/importance_sampling_ratio/mean": 0.7933129072189331, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.41737842559814453, "sampling/sampling_logp_difference/mean": 0.01487315259873867, "step": 148, "step_time": 31.222021464956924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 717.1875610351562, "completions/mean_terminated_length": 410.0769348144531, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.7839584052562714, "epoch": 0.13938260056127222, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3397277891635895, "kl": 0.023950406350195408, "learning_rate": 8.615528531337699e-06, "loss": 0.0793, "num_tokens": 24969115.0, "reward": 0.7321428656578064, "reward_std": 0.44483304023742676, "rewards/math_reward/mean": 0.7321428656578064, "rewards/math_reward/std": 0.44483304023742676, "sampling/importance_sampling_ratio/max": 2.722607374191284, "sampling/importance_sampling_ratio/mean": 0.8329526782035828, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4941678047180176, "sampling/sampling_logp_difference/mean": 0.013004844076931477, "step": 149, "step_time": 30.20366002013907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 725.9732666015625, "completions/mean_terminated_length": 438.5760803222656, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.7784537374973297, "epoch": 0.1403180542563143, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.38030388951301575, "kl": 0.021449393592774868, "learning_rate": 8.60617399438728e-06, "loss": 0.1616, "num_tokens": 25126376.0, "reward": 0.723214328289032, "reward_std": 0.4494204819202423, "rewards/math_reward/mean": 0.7232142686843872, "rewards/math_reward/std": 0.4494204819202423, "sampling/importance_sampling_ratio/max": 2.7336087226867676, "sampling/importance_sampling_ratio/mean": 0.7323525547981262, "sampling/importance_sampling_ratio/min": 0.0044091674499213696, "sampling/sampling_logp_difference/max": 0.44256067276000977, "sampling/sampling_logp_difference/mean": 0.012723101302981377, "step": 150, "step_time": 29.611200421815738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 999.4464721679688, "completions/mean_terminated_length": 522.8311767578125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 1.0927308648824692, "epoch": 0.1412535079513564, "frac_reward_zero_std": 0.785714328289032, "grad_norm": 0.163218691945076, "kl": 0.02498358767479658, "learning_rate": 8.596819457436857e-06, "loss": 0.0234, "num_tokens": 25318970.0, "reward": 0.3839285969734192, "reward_std": 0.48852667212486267, "rewards/math_reward/mean": 0.3839285671710968, "rewards/math_reward/std": 0.48852667212486267, "sampling/importance_sampling_ratio/max": 2.5249900817871094, "sampling/importance_sampling_ratio/mean": 0.6985392570495605, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4926323890686035, "sampling/sampling_logp_difference/mean": 0.017699969932436943, "step": 151, "step_time": 36.046763660851866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 820.9464721679688, "completions/mean_terminated_length": 468.3448181152344, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.8410104513168335, "epoch": 0.1421889616463985, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3205535113811493, "kl": 0.02578172693029046, "learning_rate": 8.587464920486437e-06, "loss": 0.0728, "num_tokens": 25493316.0, "reward": 0.5714285969734192, "reward_std": 0.4970957934856415, "rewards/math_reward/mean": 0.5714285969734192, "rewards/math_reward/std": 0.4970957934856415, "sampling/importance_sampling_ratio/max": 2.1757357120513916, "sampling/importance_sampling_ratio/mean": 0.6713786721229553, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49787378311157227, "sampling/sampling_logp_difference/mean": 0.014690201729536057, "step": 152, "step_time": 31.94501313753426 }, { "clip_ratio/high_max": 0.00019816106942016631, "clip_ratio/high_mean": 2.830872472259216e-05, "clip_ratio/low_mean": 3.4471672734071035e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.27803974566632e-05, "completions/clipped_ratio": 0.2589285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 863.2678833007812, "completions/mean_terminated_length": 449.3252868652344, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 1.2511562258005142, "epoch": 0.1431244153414406, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.4459776282310486, "kl": 0.023169849533587694, "learning_rate": 8.578110383536016e-06, "loss": -0.0878, "num_tokens": 25665274.0, "reward": 0.6428571939468384, "reward_std": 0.4813109338283539, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.48131096363067627, "sampling/importance_sampling_ratio/max": 2.844818115234375, "sampling/importance_sampling_ratio/mean": 0.7367504239082336, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4992547035217285, "sampling/sampling_logp_difference/mean": 0.017328599467873573, "step": 153, "step_time": 33.384917333489284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.330357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 985.83935546875, "completions/mean_terminated_length": 461.8399963378906, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 1.048023983836174, "epoch": 0.14405986903648269, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.30013254284858704, "kl": 0.02329267468303442, "learning_rate": 8.568755846585594e-06, "loss": -0.0092, "num_tokens": 25860120.0, "reward": 0.660714328289032, "reward_std": 0.4755948781967163, "rewards/math_reward/mean": 0.6607142686843872, "rewards/math_reward/std": 0.4755948781967163, "sampling/importance_sampling_ratio/max": 2.7047252655029297, "sampling/importance_sampling_ratio/mean": 0.6764511466026306, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4867992401123047, "sampling/sampling_logp_difference/mean": 0.014696327969431877, "step": 154, "step_time": 36.720785615965724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 737.0625610351562, "completions/mean_terminated_length": 320.6470642089844, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 1.0862603038549423, "epoch": 0.1449953227315248, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3107976019382477, "kl": 0.028913214337080717, "learning_rate": 8.559401309635173e-06, "loss": -0.0781, "num_tokens": 26029671.0, "reward": 0.7142857313156128, "reward_std": 0.453784316778183, "rewards/math_reward/mean": 0.7142857313156128, "rewards/math_reward/std": 0.4537842869758606, "sampling/importance_sampling_ratio/max": 2.7496912479400635, "sampling/importance_sampling_ratio/mean": 0.8344319462776184, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2102441787719727, "sampling/sampling_logp_difference/mean": 0.01682162843644619, "step": 155, "step_time": 31.216663601109758 }, { "clip_ratio/high_max": 0.00038973837217781693, "clip_ratio/high_mean": 5.567690868701902e-05, "clip_ratio/low_mean": 2.1981404643156566e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.765830787320738e-05, "completions/clipped_ratio": 0.2767857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 884.7857666015625, "completions/mean_terminated_length": 439.6049499511719, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 1.0605764836072922, "epoch": 0.14593077642656688, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.21676506102085114, "kl": 0.025946077425032854, "learning_rate": 8.550046772684753e-06, "loss": 0.0181, "num_tokens": 26208855.0, "reward": 0.6428571939468384, "reward_std": 0.4813109338283539, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.48131096363067627, "sampling/importance_sampling_ratio/max": 2.754822254180908, "sampling/importance_sampling_ratio/mean": 0.7598008513450623, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49041175842285156, "sampling/sampling_logp_difference/mean": 0.015894563868641853, "step": 156, "step_time": 34.066398312104866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 662.7232666015625, "completions/mean_terminated_length": 431.84375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.6106387376785278, "epoch": 0.146866230121609, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.36911821365356445, "kl": 0.02327269548550248, "learning_rate": 8.540692235734332e-06, "loss": -0.0145, "num_tokens": 26363992.0, "reward": 0.7500000596046448, "reward_std": 0.4349588453769684, "rewards/math_reward/mean": 0.75, "rewards/math_reward/std": 0.4349588453769684, "sampling/importance_sampling_ratio/max": 2.3987903594970703, "sampling/importance_sampling_ratio/mean": 0.7896709442138672, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6277182102203369, "sampling/sampling_logp_difference/mean": 0.013896108604967594, "step": 157, "step_time": 29.36071262904443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.267857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 849.107177734375, "completions/mean_terminated_length": 410.48779296875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.9541095942258835, "epoch": 0.14780168381665107, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.2841714024543762, "kl": 0.026035514194518328, "learning_rate": 8.531337698783911e-06, "loss": -0.0402, "num_tokens": 26540284.0, "reward": 0.6517857313156128, "reward_std": 0.47854551672935486, "rewards/math_reward/mean": 0.6517857313156128, "rewards/math_reward/std": 0.47854548692703247, "sampling/importance_sampling_ratio/max": 2.898285150527954, "sampling/importance_sampling_ratio/mean": 0.7211025357246399, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4959707260131836, "sampling/sampling_logp_difference/mean": 0.01680230163037777, "step": 158, "step_time": 34.72788548725657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2946428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 882.5357666015625, "completions/mean_terminated_length": 395.6961975097656, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.8875735253095627, "epoch": 0.14873713751169318, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2670033574104309, "kl": 0.024088594131171703, "learning_rate": 8.52198316183349e-06, "loss": 0.0555, "num_tokens": 26720944.0, "reward": 0.7053571939468384, "reward_std": 0.45793095231056213, "rewards/math_reward/mean": 0.7053571343421936, "rewards/math_reward/std": 0.45793095231056213, "sampling/importance_sampling_ratio/max": 2.5349950790405273, "sampling/importance_sampling_ratio/mean": 0.7338595986366272, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4698615074157715, "sampling/sampling_logp_difference/mean": 0.014917679131031036, "step": 159, "step_time": 34.27980098989792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 704.9732666015625, "completions/mean_terminated_length": 413.0108642578125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.8176027089357376, "epoch": 0.14967259120673526, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.683577835559845, "kl": 0.0267052692361176, "learning_rate": 8.512628624883068e-06, "loss": 0.0623, "num_tokens": 26881453.0, "reward": 0.7500000596046448, "reward_std": 0.4349588453769684, "rewards/math_reward/mean": 0.75, "rewards/math_reward/std": 0.4349588453769684, "sampling/importance_sampling_ratio/max": 2.7751193046569824, "sampling/importance_sampling_ratio/mean": 0.8025408983230591, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8094265460968018, "sampling/sampling_logp_difference/mean": 0.015720421448349953, "step": 160, "step_time": 29.817991209216416 }, { "clip_ratio/high_max": 0.0014360559907800052, "clip_ratio/high_mean": 0.00020515086134764715, "clip_ratio/low_mean": 7.595845818286762e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028110931953051477, "completions/clipped_ratio": 0.267857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 861.1517944335938, "completions/mean_terminated_length": 426.9389953613281, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.9665398448705673, "epoch": 0.15060804490177737, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3287127912044525, "kl": 0.025796031579375267, "learning_rate": 8.503274087932648e-06, "loss": 0.0035, "num_tokens": 27055710.0, "reward": 0.6517857313156128, "reward_std": 0.47854551672935486, "rewards/math_reward/mean": 0.6517857313156128, "rewards/math_reward/std": 0.47854548692703247, "sampling/importance_sampling_ratio/max": 2.9752888679504395, "sampling/importance_sampling_ratio/mean": 0.7968011498451233, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.521457314491272, "sampling/sampling_logp_difference/mean": 0.016484329476952553, "step": 161, "step_time": 33.786919004051015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 777.1339721679688, "completions/mean_terminated_length": 430.53411865234375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.6801456809043884, "epoch": 0.15154349859681945, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.3901619017124176, "kl": 0.024004534352570772, "learning_rate": 8.493919550982227e-06, "loss": 0.1087, "num_tokens": 27224861.0, "reward": 0.6785714626312256, "reward_std": 0.46912387013435364, "rewards/math_reward/mean": 0.6785714030265808, "rewards/math_reward/std": 0.46912387013435364, "sampling/importance_sampling_ratio/max": 2.8539958000183105, "sampling/importance_sampling_ratio/mean": 0.7615135908126831, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3514490127563477, "sampling/sampling_logp_difference/mean": 0.014233079738914967, "step": 162, "step_time": 30.901729668024927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 716.482177734375, "completions/mean_terminated_length": 372.38201904296875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.8998202160000801, "epoch": 0.15247895229186156, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.4153711199760437, "kl": 0.026344268582761288, "learning_rate": 8.484565014031806e-06, "loss": 0.032, "num_tokens": 27387123.0, "reward": 0.8125000596046448, "reward_std": 0.39206662774086, "rewards/math_reward/mean": 0.8125, "rewards/math_reward/std": 0.3920665979385376, "sampling/importance_sampling_ratio/max": 2.910987615585327, "sampling/importance_sampling_ratio/mean": 0.9050115942955017, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4733572006225586, "sampling/sampling_logp_difference/mean": 0.015554400160908699, "step": 163, "step_time": 30.04746007011272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 929.4464721679688, "completions/mean_terminated_length": 538.62646484375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.9132333695888519, "epoch": 0.15341440598690365, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.3762611150741577, "kl": 0.025486175436526537, "learning_rate": 8.475210477081384e-06, "loss": -0.0306, "num_tokens": 27568701.0, "reward": 0.4732142984867096, "reward_std": 0.5015259385108948, "rewards/math_reward/mean": 0.4732142984867096, "rewards/math_reward/std": 0.5015259981155396, "sampling/importance_sampling_ratio/max": 2.664757490158081, "sampling/importance_sampling_ratio/mean": 0.8009408116340637, "sampling/importance_sampling_ratio/min": 0.0009677428752183914, "sampling/sampling_logp_difference/max": 0.49752283096313477, "sampling/sampling_logp_difference/mean": 0.015698250383138657, "step": 164, "step_time": 34.17310245707631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 867.8125610351562, "completions/mean_terminated_length": 511.0116271972656, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.635365329682827, "epoch": 0.15434985968194576, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.322332501411438, "kl": 0.02217627363279462, "learning_rate": 8.465855940130964e-06, "loss": 0.0565, "num_tokens": 27744792.0, "reward": 0.7053571939468384, "reward_std": 0.4579309821128845, "rewards/math_reward/mean": 0.7053571343421936, "rewards/math_reward/std": 0.45793095231056213, "sampling/importance_sampling_ratio/max": 2.50762939453125, "sampling/importance_sampling_ratio/mean": 0.8070951700210571, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7222657799720764, "sampling/sampling_logp_difference/mean": 0.012511450797319412, "step": 165, "step_time": 32.768498909892514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 723.2410888671875, "completions/mean_terminated_length": 486.178955078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.595180869102478, "epoch": 0.15528531337698784, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.29130154848098755, "kl": 0.02145263273268938, "learning_rate": 8.456501403180543e-06, "loss": 0.0377, "num_tokens": 27900459.0, "reward": 0.392857164144516, "reward_std": 0.4905805289745331, "rewards/math_reward/mean": 0.3928571343421936, "rewards/math_reward/std": 0.4905804991722107, "sampling/importance_sampling_ratio/max": 2.5257554054260254, "sampling/importance_sampling_ratio/mean": 0.8352312445640564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4312245845794678, "sampling/sampling_logp_difference/mean": 0.013022307306528091, "step": 166, "step_time": 30.01296095782891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 807.1160888671875, "completions/mean_terminated_length": 431.9651184082031, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.768914207816124, "epoch": 0.15622076707202995, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.40742743015289307, "kl": 0.024831526912748814, "learning_rate": 8.447146866230122e-06, "loss": 0.0449, "num_tokens": 28074232.0, "reward": 0.535714328289032, "reward_std": 0.500964343547821, "rewards/math_reward/mean": 0.5357142686843872, "rewards/math_reward/std": 0.5009642839431763, "sampling/importance_sampling_ratio/max": 2.891758680343628, "sampling/importance_sampling_ratio/mean": 0.7581639885902405, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5053536891937256, "sampling/sampling_logp_difference/mean": 0.015726685523986816, "step": 167, "step_time": 31.52071974775754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 675.9375, "completions/mean_terminated_length": 413.2021179199219, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.6914248168468475, "epoch": 0.15715622076707203, "frac_reward_zero_std": 0.5, "grad_norm": 0.44162699580192566, "kl": 0.024781267624348402, "learning_rate": 8.437792329279702e-06, "loss": 0.1529, "num_tokens": 28231033.0, "reward": 0.7410714626312256, "reward_std": 0.4400150775909424, "rewards/math_reward/mean": 0.7410714030265808, "rewards/math_reward/std": 0.44001504778862, "sampling/importance_sampling_ratio/max": 2.5679931640625, "sampling/importance_sampling_ratio/mean": 0.8151453733444214, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6013827323913574, "sampling/sampling_logp_difference/mean": 0.014546399004757404, "step": 168, "step_time": 29.68056544358842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 542.3660888671875, "completions/mean_terminated_length": 410.8058166503906, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5891341790556908, "epoch": 0.1580916744621141, "frac_reward_zero_std": 0.785714328289032, "grad_norm": 0.1863221526145935, "kl": 0.026385359931737185, "learning_rate": 8.428437792329281e-06, "loss": 0.0224, "num_tokens": 28383602.0, "reward": 0.7589285969734192, "reward_std": 0.4296559691429138, "rewards/math_reward/mean": 0.7589285969734192, "rewards/math_reward/std": 0.4296559691429138, "sampling/importance_sampling_ratio/max": 2.661207675933838, "sampling/importance_sampling_ratio/mean": 0.8892472386360168, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.484943151473999, "sampling/sampling_logp_difference/mean": 0.014185307547450066, "step": 169, "step_time": 28.734094778075814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 680.9017944335938, "completions/mean_terminated_length": 419.11700439453125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.6595591455698013, "epoch": 0.15902712815715622, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.38575878739356995, "kl": 0.02353997202590108, "learning_rate": 8.419083255378859e-06, "loss": 0.1008, "num_tokens": 28538599.0, "reward": 0.6696428656578064, "reward_std": 0.47245559096336365, "rewards/math_reward/mean": 0.6696428656578064, "rewards/math_reward/std": 0.47245559096336365, "sampling/importance_sampling_ratio/max": 2.2839794158935547, "sampling/importance_sampling_ratio/mean": 0.7977717518806458, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5156641006469727, "sampling/sampling_logp_difference/mean": 0.013336528092622757, "step": 170, "step_time": 29.48198289005086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 709.8839721679688, "completions/mean_terminated_length": 534.1716918945312, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.5642878860235214, "epoch": 0.1599625818521983, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.30105358362197876, "kl": 0.023757428862154484, "learning_rate": 8.409728718428438e-06, "loss": 0.0403, "num_tokens": 28701122.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.53397536277771, "sampling/importance_sampling_ratio/mean": 0.669143557548523, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.478515625, "sampling/sampling_logp_difference/mean": 0.01335019338876009, "step": 171, "step_time": 29.782577047124505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 655.2142944335938, "completions/mean_terminated_length": 405.9789733886719, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.5899539291858673, "epoch": 0.16089803554724041, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.35898998379707336, "kl": 0.0241028000600636, "learning_rate": 8.400374181478017e-06, "loss": -0.01, "num_tokens": 28858578.0, "reward": 0.6964285969734192, "reward_std": 0.4618662893772125, "rewards/math_reward/mean": 0.6964285969734192, "rewards/math_reward/std": 0.4618662893772125, "sampling/importance_sampling_ratio/max": 2.7312939167022705, "sampling/importance_sampling_ratio/mean": 0.8734442591667175, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47177696228027344, "sampling/sampling_logp_difference/mean": 0.01355829183012247, "step": 172, "step_time": 29.51934847724624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 647.5267944335938, "completions/mean_terminated_length": 495.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.6192868426442146, "epoch": 0.1618334892422825, "frac_reward_zero_std": 0.6428571939468384, "grad_norm": 0.27859535813331604, "kl": 0.023396455217152834, "learning_rate": 8.391019644527597e-06, "loss": 0.0575, "num_tokens": 29012613.0, "reward": 0.6964285969734192, "reward_std": 0.4618662893772125, "rewards/math_reward/mean": 0.6964285969734192, "rewards/math_reward/std": 0.4618662893772125, "sampling/importance_sampling_ratio/max": 2.629453420639038, "sampling/importance_sampling_ratio/mean": 0.771118700504303, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.434848427772522, "sampling/sampling_logp_difference/mean": 0.012663105502724648, "step": 173, "step_time": 28.36511080688797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1433.0, "completions/mean_length": 785.9107666015625, "completions/mean_terminated_length": 477.4000244140625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.7649337649345398, "epoch": 0.1627689429373246, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.4048639237880707, "kl": 0.026559588965028524, "learning_rate": 8.381665107577176e-06, "loss": 0.1251, "num_tokens": 29183331.0, "reward": 0.5267857313156128, "reward_std": 0.5015259981155396, "rewards/math_reward/mean": 0.5267857313156128, "rewards/math_reward/std": 0.5015259981155396, "sampling/importance_sampling_ratio/max": 2.8909687995910645, "sampling/importance_sampling_ratio/mean": 0.807773232460022, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4879450798034668, "sampling/sampling_logp_difference/mean": 0.015417885966598988, "step": 174, "step_time": 31.24701921385713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 617.375, "completions/mean_terminated_length": 378.9375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.7701066136360168, "epoch": 0.1637043966323667, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.30796554684638977, "kl": 0.025672041345387697, "learning_rate": 8.372310570626756e-06, "loss": -0.0385, "num_tokens": 29339525.0, "reward": 0.7589285969734192, "reward_std": 0.4296559691429138, "rewards/math_reward/mean": 0.7589285969734192, "rewards/math_reward/std": 0.4296559691429138, "sampling/importance_sampling_ratio/max": 2.435729742050171, "sampling/importance_sampling_ratio/mean": 0.8759179711341858, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4749314785003662, "sampling/sampling_logp_difference/mean": 0.01511604804545641, "step": 175, "step_time": 39.67815089202486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 741.1964721679688, "completions/mean_terminated_length": 474.2150573730469, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.6235623061656952, "epoch": 0.1646398503274088, "frac_reward_zero_std": 0.5, "grad_norm": 0.1822253167629242, "kl": 0.024313028901815414, "learning_rate": 8.362956033676333e-06, "loss": -0.0133, "num_tokens": 29502595.0, "reward": 0.6696428656578064, "reward_std": 0.47245559096336365, "rewards/math_reward/mean": 0.6696428656578064, "rewards/math_reward/std": 0.47245559096336365, "sampling/importance_sampling_ratio/max": 2.7363059520721436, "sampling/importance_sampling_ratio/mean": 0.7053349614143372, "sampling/importance_sampling_ratio/min": 0.012644844129681587, "sampling/sampling_logp_difference/max": 0.4962639808654785, "sampling/sampling_logp_difference/mean": 0.01261897198855877, "step": 176, "step_time": 30.94971025106497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 740.1607666015625, "completions/mean_terminated_length": 506.1263427734375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.7602481991052628, "epoch": 0.16557530402245088, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.30973416566848755, "kl": 0.025644636247307062, "learning_rate": 8.353601496725913e-06, "loss": 0.0241, "num_tokens": 29666733.0, "reward": 0.5089285969734192, "reward_std": 0.5021671652793884, "rewards/math_reward/mean": 0.5089285969734192, "rewards/math_reward/std": 0.5021671056747437, "sampling/importance_sampling_ratio/max": 2.6582436561584473, "sampling/importance_sampling_ratio/mean": 0.8408647179603577, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5624350309371948, "sampling/sampling_logp_difference/mean": 0.013748398050665855, "step": 177, "step_time": 30.469244458014145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 720.0714721679688, "completions/mean_terminated_length": 448.7742004394531, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.7096895575523376, "epoch": 0.166510757717493, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.5292096138000488, "kl": 0.026505134999752045, "learning_rate": 8.344246959775492e-06, "loss": 0.1152, "num_tokens": 29826813.0, "reward": 0.6875000596046448, "reward_std": 0.46559563279151917, "rewards/math_reward/mean": 0.6875, "rewards/math_reward/std": 0.4655956029891968, "sampling/importance_sampling_ratio/max": 2.9277637004852295, "sampling/importance_sampling_ratio/mean": 0.8003605604171753, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4837663173675537, "sampling/sampling_logp_difference/mean": 0.013810754753649235, "step": 178, "step_time": 30.142088359221816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 762.6250610351562, "completions/mean_terminated_length": 548.3958740234375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.7947532683610916, "epoch": 0.16744621141253507, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.6339012384414673, "kl": 0.02477271156385541, "learning_rate": 8.33489242282507e-06, "loss": 0.1519, "num_tokens": 29994227.0, "reward": 0.6339285969734192, "reward_std": 0.483894407749176, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.682011365890503, "sampling/importance_sampling_ratio/mean": 0.8128923177719116, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4779093265533447, "sampling/sampling_logp_difference/mean": 0.01618090830743313, "step": 179, "step_time": 30.753209363203496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 695.919677734375, "completions/mean_terminated_length": 453.96844482421875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.5392503589391708, "epoch": 0.16838166510757718, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3631647527217865, "kl": 0.023528020828962326, "learning_rate": 8.325537885874649e-06, "loss": 0.0349, "num_tokens": 30163474.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.6883046627044678, "sampling/importance_sampling_ratio/mean": 0.8575841784477234, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4982004165649414, "sampling/sampling_logp_difference/mean": 0.011770595796406269, "step": 180, "step_time": 31.837908050976694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 806.9017944335938, "completions/mean_terminated_length": 468.42047119140625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.8638487458229065, "epoch": 0.16931711880261927, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.374946266412735, "kl": 0.02801767038181424, "learning_rate": 8.31618334892423e-06, "loss": -0.0023, "num_tokens": 30338815.0, "reward": 0.5625, "reward_std": 0.4983079433441162, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.820953130722046, "sampling/importance_sampling_ratio/mean": 0.8103893995285034, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49179649353027344, "sampling/sampling_logp_difference/mean": 0.015578866936266422, "step": 181, "step_time": 32.48509966954589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 721.357177734375, "completions/mean_terminated_length": 415.20880126953125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.6711142361164093, "epoch": 0.17025257249766138, "frac_reward_zero_std": 0.5, "grad_norm": 0.8094620108604431, "kl": 0.02350167464464903, "learning_rate": 8.306828811973808e-06, "loss": 0.3422, "num_tokens": 30501583.0, "reward": 0.625, "reward_std": 0.4862987697124481, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.784736394882202, "sampling/importance_sampling_ratio/mean": 0.8802376985549927, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5543994903564453, "sampling/sampling_logp_difference/mean": 0.013557830825448036, "step": 182, "step_time": 29.990006940905005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 718.5803833007812, "completions/mean_terminated_length": 375.0224609375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.8837331235408783, "epoch": 0.17118802619270346, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.22939668595790863, "kl": 0.028022305574268103, "learning_rate": 8.297474275023387e-06, "loss": 0.027, "num_tokens": 30662168.0, "reward": 0.625, "reward_std": 0.4862987995147705, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.9617435932159424, "sampling/importance_sampling_ratio/mean": 0.7606738209724426, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48230886459350586, "sampling/sampling_logp_difference/mean": 0.014457191340625286, "step": 183, "step_time": 29.713456886122003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 508.919677734375, "completions/mean_terminated_length": 324.22998046875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.6727949231863022, "epoch": 0.17212347988774557, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 1.0096005201339722, "kl": 0.02973073720932007, "learning_rate": 8.288119738072967e-06, "loss": 0.1165, "num_tokens": 30811959.0, "reward": 0.6875000596046448, "reward_std": 0.46559563279151917, "rewards/math_reward/mean": 0.6875, "rewards/math_reward/std": 0.4655956029891968, "sampling/importance_sampling_ratio/max": 2.467546224594116, "sampling/importance_sampling_ratio/mean": 0.8354155421257019, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49233579635620117, "sampling/sampling_logp_difference/mean": 0.01363893412053585, "step": 184, "step_time": 28.726667006732896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 828.7410888671875, "completions/mean_terminated_length": 478.3793029785156, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.8747973293066025, "epoch": 0.17305893358278765, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.5126954317092896, "kl": 0.02592736016958952, "learning_rate": 8.278765201122544e-06, "loss": 0.0429, "num_tokens": 30984250.0, "reward": 0.7053571939468384, "reward_std": 0.4579309821128845, "rewards/math_reward/mean": 0.7053571343421936, "rewards/math_reward/std": 0.45793095231056213, "sampling/importance_sampling_ratio/max": 2.982715368270874, "sampling/importance_sampling_ratio/mean": 0.7718594670295715, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5114128589630127, "sampling/sampling_logp_difference/mean": 0.014754974283277988, "step": 185, "step_time": 31.605789189226925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 722.6517944335938, "completions/mean_terminated_length": 416.8022155761719, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.8115908429026604, "epoch": 0.17399438727782976, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.4533534049987793, "kl": 0.02645656792446971, "learning_rate": 8.269410664172124e-06, "loss": -0.0551, "num_tokens": 31145107.0, "reward": 0.7410714626312256, "reward_std": 0.4400150775909424, "rewards/math_reward/mean": 0.7410714030265808, "rewards/math_reward/std": 0.44001504778862, "sampling/importance_sampling_ratio/max": 2.523991346359253, "sampling/importance_sampling_ratio/mean": 0.8044648766517639, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.590754508972168, "sampling/sampling_logp_difference/mean": 0.014949284493923187, "step": 186, "step_time": 30.28801411227323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 761.7053833007812, "completions/mean_terminated_length": 498.9139709472656, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.7752171605825424, "epoch": 0.17492984097287184, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.27746331691741943, "kl": 0.027261618059128523, "learning_rate": 8.260056127221703e-06, "loss": -0.032, "num_tokens": 31308962.0, "reward": 0.5625, "reward_std": 0.4983079731464386, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.7251856327056885, "sampling/importance_sampling_ratio/mean": 0.7299097776412964, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46177124977111816, "sampling/sampling_logp_difference/mean": 0.014724517241120338, "step": 187, "step_time": 30.685925089055672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1729.0, "completions/mean_length": 823.8214721679688, "completions/mean_terminated_length": 507.460693359375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.7617797255516052, "epoch": 0.17586529466791395, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.20970188081264496, "kl": 0.02455508802086115, "learning_rate": 8.250701590271282e-06, "loss": -0.0552, "num_tokens": 31481478.0, "reward": 0.5089285969734192, "reward_std": 0.5021671056747437, "rewards/math_reward/mean": 0.5089285969734192, "rewards/math_reward/std": 0.5021671056747437, "sampling/importance_sampling_ratio/max": 2.823094367980957, "sampling/importance_sampling_ratio/mean": 0.6860451102256775, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48262453079223633, "sampling/sampling_logp_difference/mean": 0.013653882779181004, "step": 188, "step_time": 31.61504614329897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 833.0803833007812, "completions/mean_terminated_length": 447.1647033691406, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.9269585311412811, "epoch": 0.17680074836295603, "frac_reward_zero_std": 0.5, "grad_norm": 0.3151915669441223, "kl": 0.028682213742285967, "learning_rate": 8.241347053320862e-06, "loss": 0.044, "num_tokens": 31654935.0, "reward": 0.5446428656578064, "reward_std": 0.5002412796020508, "rewards/math_reward/mean": 0.5446428656578064, "rewards/math_reward/std": 0.5002412796020508, "sampling/importance_sampling_ratio/max": 2.977029323577881, "sampling/importance_sampling_ratio/mean": 0.7892724871635437, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7165570259094238, "sampling/sampling_logp_difference/mean": 0.01676938869059086, "step": 189, "step_time": 32.74135177745484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.133928582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 623.169677734375, "completions/mean_terminated_length": 402.8350524902344, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.6988023295998573, "epoch": 0.17773620205799812, "frac_reward_zero_std": 0.5, "grad_norm": 0.3618289530277252, "kl": 0.02831080975010991, "learning_rate": 8.231992516370441e-06, "loss": -0.0439, "num_tokens": 31805058.0, "reward": 0.6339285969734192, "reward_std": 0.483894407749176, "rewards/math_reward/mean": 0.6339285969734192, "rewards/math_reward/std": 0.483894407749176, "sampling/importance_sampling_ratio/max": 2.7559854984283447, "sampling/importance_sampling_ratio/mean": 0.7891696691513062, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47434020042419434, "sampling/sampling_logp_difference/mean": 0.013960640877485275, "step": 190, "step_time": 28.534475398715585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 798.3035888671875, "completions/mean_terminated_length": 401.3411865234375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.983073428273201, "epoch": 0.17867165575304023, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.41081690788269043, "kl": 0.029110560193657875, "learning_rate": 8.222637979420019e-06, "loss": 0.0528, "num_tokens": 31974508.0, "reward": 0.6160714626312256, "reward_std": 0.48852667212486267, "rewards/math_reward/mean": 0.6160714030265808, "rewards/math_reward/std": 0.4885266423225403, "sampling/importance_sampling_ratio/max": 2.9028289318084717, "sampling/importance_sampling_ratio/mean": 0.8084117770195007, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5276827216148376, "sampling/sampling_logp_difference/mean": 0.017200341448187828, "step": 191, "step_time": 31.623337594093755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 784.3750610351562, "completions/mean_terminated_length": 492.7692565917969, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.6908202022314072, "epoch": 0.1796071094480823, "frac_reward_zero_std": 0.6428571939468384, "grad_norm": 0.17531302571296692, "kl": 0.025823537725955248, "learning_rate": 8.213283442469598e-06, "loss": 0.0235, "num_tokens": 32139406.0, "reward": 0.4821428656578064, "reward_std": 0.5019267797470093, "rewards/math_reward/mean": 0.4821428656578064, "rewards/math_reward/std": 0.5019267797470093, "sampling/importance_sampling_ratio/max": 2.6976797580718994, "sampling/importance_sampling_ratio/mean": 0.7222722172737122, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.44435572624206543, "sampling/sampling_logp_difference/mean": 0.014040273614227772, "step": 192, "step_time": 30.79078445211053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 821.6517944335938, "completions/mean_terminated_length": 487.1932067871094, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.7249132692813873, "epoch": 0.18054256314312442, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.24985502660274506, "kl": 0.025139047298580408, "learning_rate": 8.203928905519177e-06, "loss": -0.0123, "num_tokens": 32312231.0, "reward": 0.5535714626312256, "reward_std": 0.49935609102249146, "rewards/math_reward/mean": 0.5535714030265808, "rewards/math_reward/std": 0.49935612082481384, "sampling/importance_sampling_ratio/max": 2.53873872756958, "sampling/importance_sampling_ratio/mean": 0.733741283416748, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6359837651252747, "sampling/sampling_logp_difference/mean": 0.013347267173230648, "step": 193, "step_time": 31.658806477906182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 762.9107666015625, "completions/mean_terminated_length": 500.3656005859375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.8087012469768524, "epoch": 0.1814780168381665, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.34275180101394653, "kl": 0.02840651012957096, "learning_rate": 8.194574368568757e-06, "loss": 0.0446, "num_tokens": 32484717.0, "reward": 0.5714285969734192, "reward_std": 0.4970957934856415, "rewards/math_reward/mean": 0.5714285969734192, "rewards/math_reward/std": 0.4970957934856415, "sampling/importance_sampling_ratio/max": 2.6891636848449707, "sampling/importance_sampling_ratio/mean": 0.7249727249145508, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46558403968811035, "sampling/sampling_logp_difference/mean": 0.01595185324549675, "step": 194, "step_time": 34.00950350589119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 684.544677734375, "completions/mean_terminated_length": 457.3020935058594, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.6239446476101875, "epoch": 0.1824134705332086, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.4104067385196686, "kl": 0.025510120671242476, "learning_rate": 8.185219831618335e-06, "loss": 0.1739, "num_tokens": 32642338.0, "reward": 0.6785714626312256, "reward_std": 0.469123899936676, "rewards/math_reward/mean": 0.6785714030265808, "rewards/math_reward/std": 0.46912387013435364, "sampling/importance_sampling_ratio/max": 2.625528335571289, "sampling/importance_sampling_ratio/mean": 0.8169478178024292, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4897456169128418, "sampling/sampling_logp_difference/mean": 0.01265394501388073, "step": 195, "step_time": 29.708164103794843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 630.2232666015625, "completions/mean_terminated_length": 358.7340393066406, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6191665306687355, "epoch": 0.1833489242282507, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.4809143841266632, "kl": 0.02646159427240491, "learning_rate": 8.175865294667916e-06, "loss": 0.1309, "num_tokens": 32795763.0, "reward": 0.8392857313156128, "reward_std": 0.36891788244247437, "rewards/math_reward/mean": 0.8392857313156128, "rewards/math_reward/std": 0.368917852640152, "sampling/importance_sampling_ratio/max": 2.5771284103393555, "sampling/importance_sampling_ratio/mean": 0.9277664422988892, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4655327796936035, "sampling/sampling_logp_difference/mean": 0.013422984629869461, "step": 196, "step_time": 29.183647160883993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 743.8392944335938, "completions/mean_terminated_length": 442.8791198730469, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.7222148030996323, "epoch": 0.1842843779232928, "frac_reward_zero_std": 0.5, "grad_norm": 0.3630943298339844, "kl": 0.02628003992140293, "learning_rate": 8.166510757717493e-06, "loss": -0.0024, "num_tokens": 32960985.0, "reward": 0.625, "reward_std": 0.4862987697124481, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.965972661972046, "sampling/importance_sampling_ratio/mean": 0.7668924331665039, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5053701400756836, "sampling/sampling_logp_difference/mean": 0.01521432213485241, "step": 197, "step_time": 30.933369209989905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 676.0803833007812, "completions/mean_terminated_length": 447.4270935058594, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5964919775724411, "epoch": 0.18521983161833488, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.4771101176738739, "kl": 0.024993038270622492, "learning_rate": 8.157156220767073e-06, "loss": 0.059, "num_tokens": 33118042.0, "reward": 0.6696428656578064, "reward_std": 0.47245559096336365, "rewards/math_reward/mean": 0.6696428656578064, "rewards/math_reward/std": 0.47245559096336365, "sampling/importance_sampling_ratio/max": 2.8549578189849854, "sampling/importance_sampling_ratio/mean": 0.7680121660232544, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4976081848144531, "sampling/sampling_logp_difference/mean": 0.01447400264441967, "step": 198, "step_time": 29.13064423087053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 549.875, "completions/mean_terminated_length": 353.1515197753906, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.43840092420578003, "epoch": 0.186155285313377, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3532346487045288, "kl": 0.027163101360201836, "learning_rate": 8.147801683816652e-06, "loss": 0.009, "num_tokens": 33270404.0, "reward": 0.6964285969734192, "reward_std": 0.4618662893772125, "rewards/math_reward/mean": 0.6964285969734192, "rewards/math_reward/std": 0.4618663191795349, "sampling/importance_sampling_ratio/max": 2.798539161682129, "sampling/importance_sampling_ratio/mean": 0.8714045286178589, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.488955020904541, "sampling/sampling_logp_difference/mean": 0.011505073867738247, "step": 199, "step_time": 30.230362999718636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 657.8035888671875, "completions/mean_terminated_length": 490.97998046875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.6085770428180695, "epoch": 0.18709073900841908, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.3072863519191742, "kl": 0.024996183812618256, "learning_rate": 8.138447146866231e-06, "loss": -0.0138, "num_tokens": 33430014.0, "reward": 0.4285714626312256, "reward_std": 0.4970957934856415, "rewards/math_reward/mean": 0.4285714328289032, "rewards/math_reward/std": 0.4970957934856415, "sampling/importance_sampling_ratio/max": 2.593538522720337, "sampling/importance_sampling_ratio/mean": 0.7211729884147644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4707505702972412, "sampling/sampling_logp_difference/mean": 0.013759834691882133, "step": 200, "step_time": 29.256856630789116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 717.3750610351562, "completions/mean_terminated_length": 479.26318359375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.6768014132976532, "epoch": 0.1880261927034612, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3202826976776123, "kl": 0.02607622090727091, "learning_rate": 8.129092609915809e-06, "loss": 0.0577, "num_tokens": 33593200.0, "reward": 0.598214328289032, "reward_std": 0.49246248602867126, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.49246248602867126, "sampling/importance_sampling_ratio/max": 2.468071222305298, "sampling/importance_sampling_ratio/mean": 0.6956700086593628, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4205336570739746, "sampling/sampling_logp_difference/mean": 0.01505863107740879, "step": 201, "step_time": 30.074809982208535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142873108387, "completions/max_length": 2048.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 522.3303833007812, "completions/mean_terminated_length": 465.8240661621094, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.46554064750671387, "epoch": 0.18896164639850327, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.2618546485900879, "kl": 0.02175452932715416, "learning_rate": 8.119738072965388e-06, "loss": 0.0307, "num_tokens": 33730909.0, "reward": 0.625, "reward_std": 0.4862987995147705, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.1252710819244385, "sampling/importance_sampling_ratio/mean": 0.7948617935180664, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4301069974899292, "sampling/sampling_logp_difference/mean": 0.011083896271884441, "step": 202, "step_time": 26.565124013926834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 575.857177734375, "completions/mean_terminated_length": 382.5454406738281, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.6991497054696083, "epoch": 0.18989710009354538, "frac_reward_zero_std": 0.7142857313156128, "grad_norm": 0.19388268887996674, "kl": 0.026379226241260767, "learning_rate": 8.110383536014968e-06, "loss": 0.0158, "num_tokens": 33882117.0, "reward": 0.6964285969734192, "reward_std": 0.4618662893772125, "rewards/math_reward/mean": 0.6964285969734192, "rewards/math_reward/std": 0.4618662893772125, "sampling/importance_sampling_ratio/max": 2.3921926021575928, "sampling/importance_sampling_ratio/mean": 0.7844367027282715, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.379119873046875, "sampling/sampling_logp_difference/mean": 0.014256101101636887, "step": 203, "step_time": 28.463696314021945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285746216774, "completions/max_length": 2048.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 471.9375305175781, "completions/mean_terminated_length": 350.7019348144531, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.40427855402231216, "epoch": 0.19083255378858746, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3888010084629059, "kl": 0.02634357614442706, "learning_rate": 8.101028999064547e-06, "loss": 0.0368, "num_tokens": 34022406.0, "reward": 0.8214285969734192, "reward_std": 0.38471439480781555, "rewards/math_reward/mean": 0.8214285969734192, "rewards/math_reward/std": 0.38471439480781555, "sampling/importance_sampling_ratio/max": 2.70680570602417, "sampling/importance_sampling_ratio/mean": 0.8671826720237732, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4769926071166992, "sampling/sampling_logp_difference/mean": 0.011462206020951271, "step": 204, "step_time": 25.89511924586259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 846.5178833007812, "completions/mean_terminated_length": 585.3261108398438, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.5903217494487762, "epoch": 0.19176800748362957, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.32335615158081055, "kl": 0.022261875215917826, "learning_rate": 8.091674462114127e-06, "loss": 0.0379, "num_tokens": 34202008.0, "reward": 0.5178571939468384, "reward_std": 0.5019267797470093, "rewards/math_reward/mean": 0.5178571343421936, "rewards/math_reward/std": 0.5019267797470093, "sampling/importance_sampling_ratio/max": 2.885838031768799, "sampling/importance_sampling_ratio/mean": 0.7571255564689636, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4402942657470703, "sampling/sampling_logp_difference/mean": 0.012495742179453373, "step": 205, "step_time": 32.80168977379799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1490.0, "completions/mean_length": 701.482177734375, "completions/mean_terminated_length": 524.6666870117188, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.5323827266693115, "epoch": 0.19270346117867165, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.41652530431747437, "kl": 0.024085056968033314, "learning_rate": 8.082319925163706e-06, "loss": -0.0448, "num_tokens": 34358798.0, "reward": 0.5178571939468384, "reward_std": 0.5019267797470093, "rewards/math_reward/mean": 0.5178571343421936, "rewards/math_reward/std": 0.5019267797470093, "sampling/importance_sampling_ratio/max": 2.452910900115967, "sampling/importance_sampling_ratio/mean": 0.8079304695129395, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6287345886230469, "sampling/sampling_logp_difference/mean": 0.012786011211574078, "step": 206, "step_time": 29.072844298090786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 629.3303833007812, "completions/mean_terminated_length": 474.82177734375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.49753373116254807, "epoch": 0.19363891487371376, "frac_reward_zero_std": 0.5, "grad_norm": 0.5014228820800781, "kl": 0.02328176610171795, "learning_rate": 8.072965388213284e-06, "loss": 0.1022, "num_tokens": 34511923.0, "reward": 0.7142857313156128, "reward_std": 0.453784316778183, "rewards/math_reward/mean": 0.7142857313156128, "rewards/math_reward/std": 0.4537842869758606, "sampling/importance_sampling_ratio/max": 2.703565835952759, "sampling/importance_sampling_ratio/mean": 0.7592502236366272, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47545504570007324, "sampling/sampling_logp_difference/mean": 0.011615600436925888, "step": 207, "step_time": 28.733781185233966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 609.794677734375, "completions/mean_terminated_length": 437.2099914550781, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.5412189960479736, "epoch": 0.19457436856875585, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.5402888655662537, "kl": 0.024904830381274223, "learning_rate": 8.063610851262863e-06, "loss": -0.1207, "num_tokens": 34659836.0, "reward": 0.8214285969734192, "reward_std": 0.38471439480781555, "rewards/math_reward/mean": 0.8214285969734192, "rewards/math_reward/std": 0.38471439480781555, "sampling/importance_sampling_ratio/max": 2.902747631072998, "sampling/importance_sampling_ratio/mean": 0.8663859963417053, "sampling/importance_sampling_ratio/min": 0.016940150409936905, "sampling/sampling_logp_difference/max": 0.4753611087799072, "sampling/sampling_logp_difference/mean": 0.01328996941447258, "step": 208, "step_time": 28.099447982152924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 781.3750610351562, "completions/mean_terminated_length": 489.0769348144531, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.7524631917476654, "epoch": 0.19550982226379796, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.24624678492546082, "kl": 0.027045555412769318, "learning_rate": 8.054256314312442e-06, "loss": 0.1002, "num_tokens": 34829358.0, "reward": 0.6071428656578064, "reward_std": 0.4905804991722107, "rewards/math_reward/mean": 0.6071428656578064, "rewards/math_reward/std": 0.4905804991722107, "sampling/importance_sampling_ratio/max": 2.7733685970306396, "sampling/importance_sampling_ratio/mean": 0.6206665635108948, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46111226081848145, "sampling/sampling_logp_difference/mean": 0.015758467838168144, "step": 209, "step_time": 30.997832712018862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00010083444976771716, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00010083444976771716, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 699.6785888671875, "completions/mean_terminated_length": 406.5652160644531, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.6144600212574005, "epoch": 0.19644527595884004, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3717489242553711, "kl": 0.02484105108305812, "learning_rate": 8.04490177736202e-06, "loss": 0.0615, "num_tokens": 34991090.0, "reward": 0.6696428656578064, "reward_std": 0.47245559096336365, "rewards/math_reward/mean": 0.6696428656578064, "rewards/math_reward/std": 0.47245559096336365, "sampling/importance_sampling_ratio/max": 2.9776813983917236, "sampling/importance_sampling_ratio/mean": 0.8282397389411926, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6154723167419434, "sampling/sampling_logp_difference/mean": 0.012724103406071663, "step": 210, "step_time": 29.90504337917082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 686.7410888671875, "completions/mean_terminated_length": 459.8645935058594, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5462876036763191, "epoch": 0.19738072965388212, "frac_reward_zero_std": 0.5, "grad_norm": 0.2944011688232422, "kl": 0.02452030871063471, "learning_rate": 8.035547240411601e-06, "loss": -0.0396, "num_tokens": 35150197.0, "reward": 0.6696428656578064, "reward_std": 0.47245559096336365, "rewards/math_reward/mean": 0.6696428656578064, "rewards/math_reward/std": 0.47245559096336365, "sampling/importance_sampling_ratio/max": 2.8176538944244385, "sampling/importance_sampling_ratio/mean": 0.8566023111343384, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8092448711395264, "sampling/sampling_logp_difference/mean": 0.013469386845827103, "step": 211, "step_time": 29.46310770791024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 938.107177734375, "completions/mean_terminated_length": 602.5581665039062, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.7287794649600983, "epoch": 0.19831618334892423, "frac_reward_zero_std": 0.5, "grad_norm": 0.30058544874191284, "kl": 0.025761007331311703, "learning_rate": 8.02619270346118e-06, "loss": 0.03, "num_tokens": 35339169.0, "reward": 0.5089285969734192, "reward_std": 0.5021671056747437, "rewards/math_reward/mean": 0.5089285969734192, "rewards/math_reward/std": 0.5021671056747437, "sampling/importance_sampling_ratio/max": 2.8624813556671143, "sampling/importance_sampling_ratio/mean": 0.7228183746337891, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7095813751220703, "sampling/sampling_logp_difference/mean": 0.013972180895507336, "step": 212, "step_time": 36.05892783100717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 785.357177734375, "completions/mean_terminated_length": 441.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.6610443592071533, "epoch": 0.1992516370439663, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2865287959575653, "kl": 0.02523738332092762, "learning_rate": 8.016838166510758e-06, "loss": -0.0129, "num_tokens": 35514169.0, "reward": 0.6428571939468384, "reward_std": 0.4813109338283539, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.4813109338283539, "sampling/importance_sampling_ratio/max": 2.8075766563415527, "sampling/importance_sampling_ratio/mean": 0.8301560282707214, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5470852851867676, "sampling/sampling_logp_difference/mean": 0.013621282763779163, "step": 213, "step_time": 33.56480591115542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 667.8125, "completions/mean_terminated_length": 437.78125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.7671009749174118, "epoch": 0.20018709073900842, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3440057635307312, "kl": 0.028729927260428667, "learning_rate": 8.007483629560337e-06, "loss": 0.054, "num_tokens": 35673148.0, "reward": 0.6160714626312256, "reward_std": 0.48852667212486267, "rewards/math_reward/mean": 0.6160714030265808, "rewards/math_reward/std": 0.4885266423225403, "sampling/importance_sampling_ratio/max": 2.4070281982421875, "sampling/importance_sampling_ratio/mean": 0.7437207102775574, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48508524894714355, "sampling/sampling_logp_difference/mean": 0.015343215316534042, "step": 214, "step_time": 29.622278926428407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 812.1428833007812, "completions/mean_terminated_length": 400.19049072265625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.8468696773052216, "epoch": 0.2011225444340505, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3766304850578308, "kl": 0.0254323142580688, "learning_rate": 7.998129092609917e-06, "loss": 0.044, "num_tokens": 35845620.0, "reward": 0.7321428656578064, "reward_std": 0.44483307003974915, "rewards/math_reward/mean": 0.7321428656578064, "rewards/math_reward/std": 0.44483304023742676, "sampling/importance_sampling_ratio/max": 2.8767025470733643, "sampling/importance_sampling_ratio/mean": 0.7848688960075378, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5993518829345703, "sampling/sampling_logp_difference/mean": 0.016010386869311333, "step": 215, "step_time": 33.03382110223174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 681.5089721679688, "completions/mean_terminated_length": 419.8404235839844, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.58535485714674, "epoch": 0.20205799812909261, "frac_reward_zero_std": 0.5, "grad_norm": 0.3262876868247986, "kl": 0.025864411145448685, "learning_rate": 7.988774555659495e-06, "loss": 0.0325, "num_tokens": 36006829.0, "reward": 0.6964285969734192, "reward_std": 0.4618662893772125, "rewards/math_reward/mean": 0.6964285969734192, "rewards/math_reward/std": 0.4618662893772125, "sampling/importance_sampling_ratio/max": 2.595747947692871, "sampling/importance_sampling_ratio/mean": 0.8117945790290833, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4761495590209961, "sampling/sampling_logp_difference/mean": 0.012795633636415005, "step": 216, "step_time": 31.295409915270284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 656.1785888671875, "completions/mean_terminated_length": 389.6595764160156, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.6510616093873978, "epoch": 0.2029934518241347, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.34502318501472473, "kl": 0.02613558480516076, "learning_rate": 7.979420018709074e-06, "loss": 0.054, "num_tokens": 36161585.0, "reward": 0.7053571939468384, "reward_std": 0.4579309821128845, "rewards/math_reward/mean": 0.7053571343421936, "rewards/math_reward/std": 0.45793095231056213, "sampling/importance_sampling_ratio/max": 2.24603533744812, "sampling/importance_sampling_ratio/mean": 0.6820634603500366, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6239591836929321, "sampling/sampling_logp_difference/mean": 0.013901499100029469, "step": 217, "step_time": 29.388130586827174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 929.5089721679688, "completions/mean_terminated_length": 482.1125183105469, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.8778023719787598, "epoch": 0.2039289055191768, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.45575734972953796, "kl": 0.02701822714880109, "learning_rate": 7.970065481758653e-06, "loss": 0.1093, "num_tokens": 36346378.0, "reward": 0.455357164144516, "reward_std": 0.5002412796020508, "rewards/math_reward/mean": 0.4553571343421936, "rewards/math_reward/std": 0.5002412796020508, "sampling/importance_sampling_ratio/max": 2.8267979621887207, "sampling/importance_sampling_ratio/mean": 0.6517370939254761, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48536252975463867, "sampling/sampling_logp_difference/mean": 0.01573195680975914, "step": 218, "step_time": 35.01591835776344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 793.0178833007812, "completions/mean_terminated_length": 468.6966247558594, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.6770185977220535, "epoch": 0.2048643592142189, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.34233853220939636, "kl": 0.025879542343318462, "learning_rate": 7.960710944808233e-06, "loss": 0.0344, "num_tokens": 36517828.0, "reward": 0.625, "reward_std": 0.4862987995147705, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.7544217109680176, "sampling/importance_sampling_ratio/mean": 0.7397624254226685, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4800913333892822, "sampling/sampling_logp_difference/mean": 0.01408898551017046, "step": 219, "step_time": 32.338837005896494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 793.6250610351562, "completions/mean_terminated_length": 451.5227355957031, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.7589618414640427, "epoch": 0.205799812909261, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3107793629169464, "kl": 0.02731961850076914, "learning_rate": 7.951356407857812e-06, "loss": 0.0659, "num_tokens": 36687330.0, "reward": 0.660714328289032, "reward_std": 0.4755948781967163, "rewards/math_reward/mean": 0.6607142686843872, "rewards/math_reward/std": 0.4755948781967163, "sampling/importance_sampling_ratio/max": 2.9971935749053955, "sampling/importance_sampling_ratio/mean": 0.7842918634414673, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7089567184448242, "sampling/sampling_logp_difference/mean": 0.014777058735489845, "step": 220, "step_time": 31.68027190072462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 659.5892944335938, "completions/mean_terminated_length": 411.1368713378906, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.8520530685782433, "epoch": 0.20673526660430308, "frac_reward_zero_std": 0.5, "grad_norm": 0.22690655291080475, "kl": 0.025207086466252804, "learning_rate": 7.942001870907391e-06, "loss": -0.025, "num_tokens": 36843500.0, "reward": 0.8035714626312256, "reward_std": 0.3990819752216339, "rewards/math_reward/mean": 0.8035714030265808, "rewards/math_reward/std": 0.3990819454193115, "sampling/importance_sampling_ratio/max": 2.889286756515503, "sampling/importance_sampling_ratio/mean": 0.792033314704895, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4758768081665039, "sampling/sampling_logp_difference/mean": 0.016141299158334732, "step": 221, "step_time": 30.3738061520271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 701.6785888671875, "completions/mean_terminated_length": 443.872314453125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.7058630883693695, "epoch": 0.2076707202993452, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.41154834628105164, "kl": 0.025662725791335106, "learning_rate": 7.932647333956969e-06, "loss": 0.1608, "num_tokens": 37004008.0, "reward": 0.6428571939468384, "reward_std": 0.4813109338283539, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.48131096363067627, "sampling/importance_sampling_ratio/max": 2.5562736988067627, "sampling/importance_sampling_ratio/mean": 0.8116022348403931, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6187430620193481, "sampling/sampling_logp_difference/mean": 0.014655635692179203, "step": 222, "step_time": 30.173322034068406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 610.2053833007812, "completions/mean_terminated_length": 316.4623718261719, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.8246620148420334, "epoch": 0.20860617399438727, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.35155895352363586, "kl": 0.03128030803054571, "learning_rate": 7.923292797006548e-06, "loss": 0.0568, "num_tokens": 37158191.0, "reward": 0.7589285969734192, "reward_std": 0.4296559691429138, "rewards/math_reward/mean": 0.7589285969734192, "rewards/math_reward/std": 0.4296559691429138, "sampling/importance_sampling_ratio/max": 2.8645966053009033, "sampling/importance_sampling_ratio/mean": 0.7886923551559448, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46312904357910156, "sampling/sampling_logp_difference/mean": 0.015887979418039322, "step": 223, "step_time": 28.44103448977694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 727.9910888671875, "completions/mean_terminated_length": 458.31182861328125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.6562155932188034, "epoch": 0.20954162768942938, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3402688503265381, "kl": 0.024637558963149786, "learning_rate": 7.913938260056128e-06, "loss": -0.0803, "num_tokens": 37320590.0, "reward": 0.6785714626312256, "reward_std": 0.46912387013435364, "rewards/math_reward/mean": 0.6785714030265808, "rewards/math_reward/std": 0.46912387013435364, "sampling/importance_sampling_ratio/max": 2.9273016452789307, "sampling/importance_sampling_ratio/mean": 0.783342719078064, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5891077518463135, "sampling/sampling_logp_difference/mean": 0.013085336424410343, "step": 224, "step_time": 30.409944667015225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 730.2678833007812, "completions/mean_terminated_length": 477.9361572265625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.8285171985626221, "epoch": 0.21047708138447146, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.30124127864837646, "kl": 0.025614875834435225, "learning_rate": 7.904583723105707e-06, "loss": 0.0648, "num_tokens": 37484268.0, "reward": 0.6071428656578064, "reward_std": 0.4905805289745331, "rewards/math_reward/mean": 0.6071428656578064, "rewards/math_reward/std": 0.4905804991722107, "sampling/importance_sampling_ratio/max": 2.822965145111084, "sampling/importance_sampling_ratio/mean": 0.80544513463974, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49129390716552734, "sampling/sampling_logp_difference/mean": 0.015872595831751823, "step": 225, "step_time": 30.192544142948464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 794.7589721679688, "completions/mean_terminated_length": 396.67059326171875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.7172247022390366, "epoch": 0.21141253507951357, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.2119993269443512, "kl": 0.027045411057770252, "learning_rate": 7.895229186155285e-06, "loss": -0.0487, "num_tokens": 37655705.0, "reward": 0.6964285969734192, "reward_std": 0.4618662893772125, "rewards/math_reward/mean": 0.6964285969734192, "rewards/math_reward/std": 0.4618663191795349, "sampling/importance_sampling_ratio/max": 2.2397208213806152, "sampling/importance_sampling_ratio/mean": 0.7730112075805664, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8136630058288574, "sampling/sampling_logp_difference/mean": 0.014084048569202423, "step": 226, "step_time": 32.774031535722315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 670.5089721679688, "completions/mean_terminated_length": 371.0543518066406, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.7171078324317932, "epoch": 0.21234798877455566, "frac_reward_zero_std": 0.5, "grad_norm": 0.2971091568470001, "kl": 0.02897025877609849, "learning_rate": 7.885874649204866e-06, "loss": 0.0236, "num_tokens": 37820546.0, "reward": 0.6785714626312256, "reward_std": 0.46912387013435364, "rewards/math_reward/mean": 0.6785714030265808, "rewards/math_reward/std": 0.46912387013435364, "sampling/importance_sampling_ratio/max": 2.2740933895111084, "sampling/importance_sampling_ratio/mean": 0.7586073279380798, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4154837131500244, "sampling/sampling_logp_difference/mean": 0.01596171408891678, "step": 227, "step_time": 31.128185395151377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 721.3392944335938, "completions/mean_terminated_length": 483.9368591308594, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.687818318605423, "epoch": 0.21328344246959777, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3431384861469269, "kl": 0.023443395737558603, "learning_rate": 7.876520112254444e-06, "loss": 0.1175, "num_tokens": 37978144.0, "reward": 0.625, "reward_std": 0.4862987697124481, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.820946455001831, "sampling/importance_sampling_ratio/mean": 0.7576214671134949, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.468416690826416, "sampling/sampling_logp_difference/mean": 0.013757895678281784, "step": 228, "step_time": 30.175523706013337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 794.4375610351562, "completions/mean_terminated_length": 554.3936157226562, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.6269289404153824, "epoch": 0.21421889616463985, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.2178371399641037, "kl": 0.023604675196111202, "learning_rate": 7.867165575304023e-06, "loss": -0.0429, "num_tokens": 38142561.0, "reward": 0.455357164144516, "reward_std": 0.5002412796020508, "rewards/math_reward/mean": 0.4553571343421936, "rewards/math_reward/std": 0.5002412796020508, "sampling/importance_sampling_ratio/max": 2.3422508239746094, "sampling/importance_sampling_ratio/mean": 0.8219044804573059, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5313522815704346, "sampling/sampling_logp_difference/mean": 0.013486770913004875, "step": 229, "step_time": 30.649162457091734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 595.5357666015625, "completions/mean_terminated_length": 437.3465270996094, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.5398378744721413, "epoch": 0.21515434985968196, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.507806658744812, "kl": 0.024334358051419258, "learning_rate": 7.857811038353602e-06, "loss": 0.1041, "num_tokens": 38289637.0, "reward": 0.6428571939468384, "reward_std": 0.4813109338283539, "rewards/math_reward/mean": 0.6428571343421936, "rewards/math_reward/std": 0.48131096363067627, "sampling/importance_sampling_ratio/max": 2.8155019283294678, "sampling/importance_sampling_ratio/mean": 0.887072741985321, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49428749084472656, "sampling/sampling_logp_difference/mean": 0.011935004964470863, "step": 230, "step_time": 27.857081771828234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 896.5714721679688, "completions/mean_terminated_length": 615.1111450195312, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.6840531677007675, "epoch": 0.21608980355472404, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2676672041416168, "kl": 0.02388938469812274, "learning_rate": 7.848456501403182e-06, "loss": -0.0871, "num_tokens": 38469765.0, "reward": 0.4107142984867096, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.4107142984867096, "rewards/math_reward/std": 0.4941745698451996, "sampling/importance_sampling_ratio/max": 2.8314921855926514, "sampling/importance_sampling_ratio/mean": 0.7584066987037659, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6057376861572266, "sampling/sampling_logp_difference/mean": 0.013479799032211304, "step": 231, "step_time": 33.123190568294376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 763.7500610351562, "completions/mean_terminated_length": 394.712646484375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.7552548572421074, "epoch": 0.21702525724976612, "frac_reward_zero_std": 0.5, "grad_norm": 0.3675174117088318, "kl": 0.02361208526417613, "learning_rate": 7.83910196445276e-06, "loss": 0.1144, "num_tokens": 38636753.0, "reward": 0.6071428656578064, "reward_std": 0.4905804991722107, "rewards/math_reward/mean": 0.6071428656578064, "rewards/math_reward/std": 0.4905804991722107, "sampling/importance_sampling_ratio/max": 2.9900362491607666, "sampling/importance_sampling_ratio/mean": 0.7620573043823242, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5661635398864746, "sampling/sampling_logp_difference/mean": 0.01418176107108593, "step": 232, "step_time": 32.48493041913025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 709.6250610351562, "completions/mean_terminated_length": 363.7528076171875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.7456820458173752, "epoch": 0.21796071094480823, "frac_reward_zero_std": 0.5, "grad_norm": 0.2785572409629822, "kl": 0.024259990081191063, "learning_rate": 7.829747427502339e-06, "loss": 0.0898, "num_tokens": 38799071.0, "reward": 0.7053571939468384, "reward_std": 0.45793095231056213, "rewards/math_reward/mean": 0.7053571343421936, "rewards/math_reward/std": 0.45793095231056213, "sampling/importance_sampling_ratio/max": 2.695263624191284, "sampling/importance_sampling_ratio/mean": 0.8066477179527283, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4379836320877075, "sampling/sampling_logp_difference/mean": 0.015447668731212616, "step": 233, "step_time": 30.471192217897624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 787.1517944335938, "completions/mean_terminated_length": 461.3146057128906, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.6967739090323448, "epoch": 0.21889616463985032, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.2891371250152588, "kl": 0.026082098949700594, "learning_rate": 7.820392890551918e-06, "loss": 0.0621, "num_tokens": 38969320.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.6585211753845215, "sampling/importance_sampling_ratio/mean": 0.7645546793937683, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9996612071990967, "sampling/sampling_logp_difference/mean": 0.014275161549448967, "step": 234, "step_time": 31.220625536050647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 621.3303833007812, "completions/mean_terminated_length": 383.5520935058594, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5523120984435081, "epoch": 0.21983161833489243, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.5001733899116516, "kl": 0.024231742601841688, "learning_rate": 7.811038353601498e-06, "loss": 0.0794, "num_tokens": 39120005.0, "reward": 0.7410714626312256, "reward_std": 0.4400150775909424, "rewards/math_reward/mean": 0.7410714030265808, "rewards/math_reward/std": 0.44001504778862, "sampling/importance_sampling_ratio/max": 2.7239267826080322, "sampling/importance_sampling_ratio/mean": 0.7918099164962769, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5686140060424805, "sampling/sampling_logp_difference/mean": 0.011931848712265491, "step": 235, "step_time": 28.1768031779211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 725.5000610351562, "completions/mean_terminated_length": 505.0833435058594, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.6444588899612427, "epoch": 0.2207670720299345, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3082082271575928, "kl": 0.023106816224753857, "learning_rate": 7.801683816651077e-06, "loss": 0.0001, "num_tokens": 39283853.0, "reward": 0.5625, "reward_std": 0.4983079731464386, "rewards/math_reward/mean": 0.5625, "rewards/math_reward/std": 0.4983079433441162, "sampling/importance_sampling_ratio/max": 2.676107883453369, "sampling/importance_sampling_ratio/mean": 0.7768716812133789, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4657282829284668, "sampling/sampling_logp_difference/mean": 0.013097940012812614, "step": 236, "step_time": 30.306598151801154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 856.8392944335938, "completions/mean_terminated_length": 514.5516967773438, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.4743339568376541, "epoch": 0.22170252572497662, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.28368237614631653, "kl": 0.021945536602288485, "learning_rate": 7.792329279700656e-06, "loss": 0.0083, "num_tokens": 39458059.0, "reward": 0.455357164144516, "reward_std": 0.5002412796020508, "rewards/math_reward/mean": 0.4553571343421936, "rewards/math_reward/std": 0.5002412796020508, "sampling/importance_sampling_ratio/max": 2.3930156230926514, "sampling/importance_sampling_ratio/mean": 0.7956547141075134, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4583010673522949, "sampling/sampling_logp_difference/mean": 0.010974793694913387, "step": 237, "step_time": 32.0192437984515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 808.1517944335938, "completions/mean_terminated_length": 487.7415771484375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.7037905752658844, "epoch": 0.2226379794200187, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.2179689258337021, "kl": 0.025488819926977158, "learning_rate": 7.782974742750234e-06, "loss": -0.0584, "num_tokens": 39630524.0, "reward": 0.6160714626312256, "reward_std": 0.48852667212486267, "rewards/math_reward/mean": 0.6160714030265808, "rewards/math_reward/std": 0.4885266423225403, "sampling/importance_sampling_ratio/max": 2.420788288116455, "sampling/importance_sampling_ratio/mean": 0.7620744705200195, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5279102921485901, "sampling/sampling_logp_difference/mean": 0.014093409292399883, "step": 238, "step_time": 31.770274584181607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 623.7767944335938, "completions/mean_terminated_length": 368.9158020019531, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.7397265285253525, "epoch": 0.2235734331150608, "frac_reward_zero_std": 0.5, "grad_norm": 0.30933648347854614, "kl": 0.023352894466370344, "learning_rate": 7.773620205799813e-06, "loss": -0.0025, "num_tokens": 39784411.0, "reward": 0.7946428656578064, "reward_std": 0.4057779312133789, "rewards/math_reward/mean": 0.7946428656578064, "rewards/math_reward/std": 0.4057779014110565, "sampling/importance_sampling_ratio/max": 2.5435516834259033, "sampling/importance_sampling_ratio/mean": 0.8086775541305542, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.45995044708251953, "sampling/sampling_logp_difference/mean": 0.01510409265756607, "step": 239, "step_time": 29.067961735883728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 769.794677734375, "completions/mean_terminated_length": 474.8241882324219, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.615973487496376, "epoch": 0.2245088868101029, "frac_reward_zero_std": 0.5, "grad_norm": 0.26706579327583313, "kl": 0.022938521578907967, "learning_rate": 7.764265668849393e-06, "loss": 0.0548, "num_tokens": 39949604.0, "reward": 0.660714328289032, "reward_std": 0.4755948781967163, "rewards/math_reward/mean": 0.6607142686843872, "rewards/math_reward/std": 0.4755948781967163, "sampling/importance_sampling_ratio/max": 2.922107458114624, "sampling/importance_sampling_ratio/mean": 0.7505427002906799, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5277410745620728, "sampling/sampling_logp_difference/mean": 0.01272582821547985, "step": 240, "step_time": 30.994826320093125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 688.3303833007812, "completions/mean_terminated_length": 461.71875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.7354798018932343, "epoch": 0.225444340505145, "frac_reward_zero_std": 0.5, "grad_norm": 0.3500584363937378, "kl": 0.02601192006841302, "learning_rate": 7.75491113189897e-06, "loss": 0.0772, "num_tokens": 40104921.0, "reward": 0.5803571939468384, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.5803571343421936, "rewards/math_reward/std": 0.49571847915649414, "sampling/importance_sampling_ratio/max": 2.676591396331787, "sampling/importance_sampling_ratio/mean": 0.8031937479972839, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48412442207336426, "sampling/sampling_logp_difference/mean": 0.013606213964521885, "step": 241, "step_time": 29.614836202934384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 635.8392944335938, "completions/mean_terminated_length": 347.3333435058594, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.7009027600288391, "epoch": 0.22637979420018708, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2705008387565613, "kl": 0.021749537903815508, "learning_rate": 7.745556594948551e-06, "loss": -0.0587, "num_tokens": 40260287.0, "reward": 0.848214328289032, "reward_std": 0.3604257106781006, "rewards/math_reward/mean": 0.8482142686843872, "rewards/math_reward/std": 0.3604257106781006, "sampling/importance_sampling_ratio/max": 2.7452926635742188, "sampling/importance_sampling_ratio/mean": 0.8514882326126099, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4951162338256836, "sampling/sampling_logp_difference/mean": 0.013769869692623615, "step": 242, "step_time": 29.13138201413676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.133928582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 734.0803833007812, "completions/mean_terminated_length": 530.8969116210938, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.5010496973991394, "epoch": 0.2273152478952292, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3490602374076843, "kl": 0.02285487111657858, "learning_rate": 7.73620205799813e-06, "loss": 0.0123, "num_tokens": 40424168.0, "reward": 0.6071428656578064, "reward_std": 0.4905805289745331, "rewards/math_reward/mean": 0.6071428656578064, "rewards/math_reward/std": 0.4905804991722107, "sampling/importance_sampling_ratio/max": 2.744675636291504, "sampling/importance_sampling_ratio/mean": 0.9448047876358032, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7083321809768677, "sampling/sampling_logp_difference/mean": 0.011830237694084644, "step": 243, "step_time": 30.151185810333118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 902.4464721679688, "completions/mean_terminated_length": 502.1927490234375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.5795765370130539, "epoch": 0.22825070159027128, "frac_reward_zero_std": 0.6428571939468384, "grad_norm": 0.17917385697364807, "kl": 0.01998627372086048, "learning_rate": 7.726847521047708e-06, "loss": 0.0491, "num_tokens": 40604098.0, "reward": 0.5267857313156128, "reward_std": 0.5015259981155396, "rewards/math_reward/mean": 0.5267857313156128, "rewards/math_reward/std": 0.5015259385108948, "sampling/importance_sampling_ratio/max": 2.3758718967437744, "sampling/importance_sampling_ratio/mean": 0.646510660648346, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5699691772460938, "sampling/sampling_logp_difference/mean": 0.013218754902482033, "step": 244, "step_time": 34.485998986056075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 855.7500610351562, "completions/mean_terminated_length": 458.3333435058594, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.7355019897222519, "epoch": 0.2291861552853134, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.17384706437587738, "kl": 0.020941502414643764, "learning_rate": 7.717492984097288e-06, "loss": 0.0152, "num_tokens": 40780766.0, "reward": 0.625, "reward_std": 0.4862987995147705, "rewards/math_reward/mean": 0.625, "rewards/math_reward/std": 0.4862987697124481, "sampling/importance_sampling_ratio/max": 2.5503809452056885, "sampling/importance_sampling_ratio/mean": 0.6846118569374084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5538830757141113, "sampling/sampling_logp_difference/mean": 0.013645053841173649, "step": 245, "step_time": 33.72171409497969 }, { "clip_ratio/high_max": 0.001118128770031035, "clip_ratio/high_mean": 0.00015973267727531493, "clip_ratio/low_mean": 2.8286761335039046e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018801943770085927, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 856.0089721679688, "completions/mean_terminated_length": 580.93408203125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.5612276941537857, "epoch": 0.23012160898035547, "frac_reward_zero_std": 0.5, "grad_norm": 0.31532102823257446, "kl": 0.02060764655470848, "learning_rate": 7.708138447146867e-06, "loss": -0.0005, "num_tokens": 40952383.0, "reward": 0.4196428656578064, "reward_std": 0.49571847915649414, "rewards/math_reward/mean": 0.4196428656578064, "rewards/math_reward/std": 0.49571844935417175, "sampling/importance_sampling_ratio/max": 2.8500428199768066, "sampling/importance_sampling_ratio/mean": 0.699031412601471, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47576260566711426, "sampling/sampling_logp_difference/mean": 0.012264827266335487, "step": 246, "step_time": 31.50224802014418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 748.7053833007812, "completions/mean_terminated_length": 375.3448181152344, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.6087137535214424, "epoch": 0.23105706267539758, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2648636996746063, "kl": 0.020661357790231705, "learning_rate": 7.698783910196445e-06, "loss": 0.0475, "num_tokens": 41113750.0, "reward": 0.7589285969734192, "reward_std": 0.4296559691429138, "rewards/math_reward/mean": 0.7589285969734192, "rewards/math_reward/std": 0.4296559691429138, "sampling/importance_sampling_ratio/max": 2.8000757694244385, "sampling/importance_sampling_ratio/mean": 0.8039107918739319, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5089101195335388, "sampling/sampling_logp_difference/mean": 0.012704139575362206, "step": 247, "step_time": 30.46202869620174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 823.9732666015625, "completions/mean_terminated_length": 453.9186096191406, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.595280334353447, "epoch": 0.23199251637043966, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.23225237429141998, "kl": 0.02190704084932804, "learning_rate": 7.689429373246024e-06, "loss": -0.0061, "num_tokens": 41283547.0, "reward": 0.5892857313156128, "reward_std": 0.4941745698451996, "rewards/math_reward/mean": 0.5892857313156128, "rewards/math_reward/std": 0.4941745698451996, "sampling/importance_sampling_ratio/max": 2.8424742221832275, "sampling/importance_sampling_ratio/mean": 0.8089728951454163, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1794288158416748, "sampling/sampling_logp_difference/mean": 0.011653567664325237, "step": 248, "step_time": 31.846554692136124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2767857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 885.3750610351562, "completions/mean_terminated_length": 440.4197692871094, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.8774009943008423, "epoch": 0.23292797006548177, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.21502819657325745, "kl": 0.023111009504646063, "learning_rate": 7.680074836295604e-06, "loss": 0.0156, "num_tokens": 41466677.0, "reward": 0.5714285969734192, "reward_std": 0.49709582328796387, "rewards/math_reward/mean": 0.5714285969734192, "rewards/math_reward/std": 0.4970957934856415, "sampling/importance_sampling_ratio/max": 2.7306923866271973, "sampling/importance_sampling_ratio/mean": 0.7436543107032776, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4931039810180664, "sampling/sampling_logp_difference/mean": 0.015262868255376816, "step": 249, "step_time": 34.24733294406906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 862.8928833007812, "completions/mean_terminated_length": 504.6046447753906, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.692407101392746, "epoch": 0.23386342376052385, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.30146998167037964, "kl": 0.022079723421484232, "learning_rate": 7.670720299345183e-06, "loss": -0.0332, "num_tokens": 41644329.0, "reward": 0.598214328289032, "reward_std": 0.49246248602867126, "rewards/math_reward/mean": 0.5982142686843872, "rewards/math_reward/std": 0.49246248602867126, "sampling/importance_sampling_ratio/max": 2.7453267574310303, "sampling/importance_sampling_ratio/mean": 0.7214359641075134, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.158416509628296, "sampling/sampling_logp_difference/mean": 0.012606189586222172, "step": 250, "step_time": 33.20607776194811 } ], "logging_steps": 1, "max_steps": 1069, "num_input_tokens_seen": 41644329, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }