{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.005, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1178.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 1068.8125, "completions/mean_terminated_length": 1068.8125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.14827869273722172, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.5875381827354431, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0258, "num_tokens": 60931.0, "reward": -0.5099351406097412, "reward_std": 0.14631026983261108, "rewards/rollout_reward_func/mean": -0.5099351406097412, "rewards/rollout_reward_func/std": 0.14447812736034393, "sampling/importance_sampling_ratio/max": 1.571394443511963, "sampling/importance_sampling_ratio/mean": 0.9980878829956055, "sampling/importance_sampling_ratio/min": 0.5769830346107483, "sampling/sampling_logp_difference/max": 0.5499424934387207, "sampling/sampling_logp_difference/mean": 0.014445322565734386, "step": 1, "step_time": 20.571737493999535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.14827869273722172, "epoch": 2e-05, "grad_norm": 0.5879868268966675, "kl": 0.0, "learning_rate": 1.4e-06, "loss": -0.0258, "step": 2, "step_time": 4.626672280999628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 1112.34375, "completions/mean_terminated_length": 1112.34375, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "entropy": 0.1730620078742504, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.6324847340583801, "kl": 0.0015527242940152064, "learning_rate": 2.8e-06, "loss": -0.0031, "num_tokens": 123719.0, "reward": -0.5000399947166443, "reward_std": 0.20460031926631927, "rewards/rollout_reward_func/mean": -0.5000399947166443, "rewards/rollout_reward_func/std": 0.21835429966449738, "sampling/importance_sampling_ratio/max": 1.846315622329712, "sampling/importance_sampling_ratio/mean": 1.0073394775390625, "sampling/importance_sampling_ratio/min": 0.6440466046333313, "sampling/sampling_logp_difference/max": 0.613192081451416, "sampling/sampling_logp_difference/mean": 0.025099236518144608, "step": 3, "step_time": 19.98843026600207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006547619355842471, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006547619355842471, "entropy": 0.1720133889466524, "epoch": 4e-05, "grad_norm": 0.6582677960395813, "kl": 0.0018092537175107282, "learning_rate": 4.2e-06, "loss": -0.0027, "step": 4, "step_time": 5.736385023999901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 1079.75, "completions/mean_terminated_length": 1079.75, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "entropy": 0.18103219754993916, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.1121903657913208, "kl": 0.0014489683271676768, "learning_rate": 5.6e-06, "loss": 0.0012, "num_tokens": 184805.0, "reward": -0.4433603286743164, "reward_std": 0.2174307256937027, "rewards/rollout_reward_func/mean": -0.4433603286743164, "rewards/rollout_reward_func/std": 0.23637622594833374, "sampling/importance_sampling_ratio/max": 2.4048261642456055, "sampling/importance_sampling_ratio/mean": 1.0075390338897705, "sampling/importance_sampling_ratio/min": 0.7123521566390991, "sampling/sampling_logp_difference/max": 0.8774776458740234, "sampling/sampling_logp_difference/mean": 0.021090637892484665, "step": 5, "step_time": 20.38773999299883 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.004315476398915052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006398809840902686, "entropy": 0.17933203652501106, "epoch": 6e-05, "grad_norm": 0.6975399255752563, "kl": 0.002128426371200476, "learning_rate": 7e-06, "loss": 0.0018, "step": 6, "step_time": 4.544671104998997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 1101.0, "completions/mean_terminated_length": 1101.0, "completions/min_length": 1018.0, "completions/min_terminated_length": 1018.0, "entropy": 0.16611213609576225, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.7441306710243225, "kl": 0.0018047343110083602, "learning_rate": 8.4e-06, "loss": 0.0058, "num_tokens": 247532.0, "reward": -0.49166715145111084, "reward_std": 0.19860149919986725, "rewards/rollout_reward_func/mean": -0.49166715145111084, "rewards/rollout_reward_func/std": 0.21013204753398895, "sampling/importance_sampling_ratio/max": 1.4890519380569458, "sampling/importance_sampling_ratio/mean": 0.9949239492416382, "sampling/importance_sampling_ratio/min": 0.6358773708343506, "sampling/sampling_logp_difference/max": 0.45274949073791504, "sampling/sampling_logp_difference/mean": 0.019934551790356636, "step": 7, "step_time": 19.994659061996572 }, { "clip_ratio/high_max": 0.017868546303361654, "clip_ratio/high_mean": 0.008934273151680827, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008934273151680827, "entropy": 0.16904153488576412, "epoch": 8e-05, "grad_norm": 0.7918809652328491, "kl": 0.0015877512742008548, "learning_rate": 9.8e-06, "loss": 0.0075, "step": 8, "step_time": 4.633046563001699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1187.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 1114.96875, "completions/mean_terminated_length": 1114.96875, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "entropy": 0.1565935993567109, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.0528326034545898, "kl": 0.003487051093543414, "learning_rate": 1.12e-05, "loss": 0.0003, "num_tokens": 310205.0, "reward": -0.592670202255249, "reward_std": 0.17108096182346344, "rewards/rollout_reward_func/mean": -0.592670202255249, "rewards/rollout_reward_func/std": 0.17350725829601288, "sampling/importance_sampling_ratio/max": 1.3040456771850586, "sampling/importance_sampling_ratio/mean": 1.0003635883331299, "sampling/importance_sampling_ratio/min": 0.35866811871528625, "sampling/sampling_logp_difference/max": 1.0253578424453735, "sampling/sampling_logp_difference/mean": 0.017204465344548225, "step": 9, "step_time": 20.192287416995896 }, { "clip_ratio/high_max": 0.01278735650703311, "clip_ratio/high_mean": 0.006393678253516555, "clip_ratio/low_mean": 0.008625821210443974, "clip_ratio/low_min": 0.004310344811528921, "clip_ratio/region_mean": 0.015019499463960528, "entropy": 0.1551358327269554, "epoch": 0.0001, "grad_norm": 0.8208324313163757, "kl": 0.003083013005380053, "learning_rate": 1.2599999999999998e-05, "loss": 0.0, "step": 10, "step_time": 4.6048961259984935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 1117.59375, "completions/mean_terminated_length": 1117.59375, "completions/min_length": 1024.0, "completions/min_terminated_length": 1024.0, "entropy": 0.14400858711451292, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.7148432731628418, "kl": 0.003160052772727795, "learning_rate": 1.4e-05, "loss": -0.0004, "num_tokens": 372953.0, "reward": -0.5451383590698242, "reward_std": 0.17130792140960693, "rewards/rollout_reward_func/mean": -0.5451383590698242, "rewards/rollout_reward_func/std": 0.19911673665046692, "sampling/importance_sampling_ratio/max": 1.359271764755249, "sampling/importance_sampling_ratio/mean": 0.9992703199386597, "sampling/importance_sampling_ratio/min": 0.5106638073921204, "sampling/sampling_logp_difference/max": 0.6720438003540039, "sampling/sampling_logp_difference/mean": 0.015419408679008484, "step": 11, "step_time": 19.467455764000988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010791826527565718, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010791826527565718, "entropy": 0.14205167535692453, "epoch": 0.00012, "grad_norm": 0.3848305642604828, "kl": 0.0060716886655427516, "learning_rate": 1.5399999999999998e-05, "loss": -0.002, "step": 12, "step_time": 4.658318636997137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 1096.0625, "completions/mean_terminated_length": 1096.0625, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "entropy": 0.18431206047534943, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.7360744476318359, "kl": 0.013301395112648606, "learning_rate": 1.68e-05, "loss": -0.0023, "num_tokens": 435496.0, "reward": -0.5159348249435425, "reward_std": 0.24927425384521484, "rewards/rollout_reward_func/mean": -0.5159348249435425, "rewards/rollout_reward_func/std": 0.25868040323257446, "sampling/importance_sampling_ratio/max": 1.4742515087127686, "sampling/importance_sampling_ratio/mean": 0.9967504739761353, "sampling/importance_sampling_ratio/min": 0.26180464029312134, "sampling/sampling_logp_difference/max": 1.3401566743850708, "sampling/sampling_logp_difference/mean": 0.02116217091679573, "step": 13, "step_time": 20.27689285100314 }, { "clip_ratio/high_max": 0.018284704070538282, "clip_ratio/high_mean": 0.013861013110727072, "clip_ratio/low_mean": 0.010935504455119371, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024796517565846443, "entropy": 0.18203460797667503, "epoch": 0.00014, "grad_norm": 0.6735957860946655, "kl": 0.024826159758958966, "learning_rate": 1.82e-05, "loss": -0.0019, "step": 14, "step_time": 4.5899165110022295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 1113.34375, "completions/mean_terminated_length": 1113.34375, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "entropy": 0.1684763329103589, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.8105282187461853, "kl": 0.0260606175288558, "learning_rate": 1.96e-05, "loss": 0.0022, "num_tokens": 498326.0, "reward": -0.5362461805343628, "reward_std": 0.20841655135154724, "rewards/rollout_reward_func/mean": -0.5362461805343628, "rewards/rollout_reward_func/std": 0.23026782274246216, "sampling/importance_sampling_ratio/max": 1.7616028785705566, "sampling/importance_sampling_ratio/mean": 0.9983079433441162, "sampling/importance_sampling_ratio/min": 0.47773033380508423, "sampling/sampling_logp_difference/max": 0.7387088537216187, "sampling/sampling_logp_difference/mean": 0.025270570069551468, "step": 15, "step_time": 19.476561630001015 }, { "clip_ratio/high_max": 0.012941297609359026, "clip_ratio/high_mean": 0.008553982246667147, "clip_ratio/low_mean": 0.011011905036866665, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01956588728353381, "entropy": 0.16627341881394386, "epoch": 0.00016, "grad_norm": 0.7380857467651367, "kl": 0.03526182088535279, "learning_rate": 2.1e-05, "loss": 0.0001, "step": 16, "step_time": 4.61911116200281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 1114.09375, "completions/mean_terminated_length": 1114.09375, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "entropy": 0.1819753721356392, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.6300584077835083, "kl": 0.042976281489245594, "learning_rate": 2.24e-05, "loss": -0.0039, "num_tokens": 561355.0, "reward": -0.48456522822380066, "reward_std": 0.22390829026699066, "rewards/rollout_reward_func/mean": -0.48456522822380066, "rewards/rollout_reward_func/std": 0.23524193465709686, "sampling/importance_sampling_ratio/max": 1.670470118522644, "sampling/importance_sampling_ratio/mean": 0.9951282739639282, "sampling/importance_sampling_ratio/min": 0.4862470328807831, "sampling/sampling_logp_difference/max": 0.7210384607315063, "sampling/sampling_logp_difference/mean": 0.026350775733590126, "step": 17, "step_time": 19.582436463004342 }, { "clip_ratio/high_max": 0.01696428656578064, "clip_ratio/high_mean": 0.010637315455824137, "clip_ratio/low_mean": 0.010957828490063548, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02159514371305704, "entropy": 0.18023785762488842, "epoch": 0.00018, "grad_norm": 0.5049033761024475, "kl": 0.08271767722908407, "learning_rate": 2.38e-05, "loss": -0.0058, "step": 18, "step_time": 5.156734605005113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1175.0, "completions/max_terminated_length": 1175.0, "completions/mean_length": 1104.75, "completions/mean_terminated_length": 1104.75, "completions/min_length": 1026.0, "completions/min_terminated_length": 1026.0, "entropy": 0.1767545472830534, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.7095220685005188, "kl": 0.1111021441174671, "learning_rate": 2.5199999999999996e-05, "loss": -0.0077, "num_tokens": 623966.0, "reward": -0.4877975881099701, "reward_std": 0.22178685665130615, "rewards/rollout_reward_func/mean": -0.4877975881099701, "rewards/rollout_reward_func/std": 0.21573103964328766, "sampling/importance_sampling_ratio/max": 1.5042498111724854, "sampling/importance_sampling_ratio/mean": 1.0011495351791382, "sampling/importance_sampling_ratio/min": 0.3050249516963959, "sampling/sampling_logp_difference/max": 1.187361717224121, "sampling/sampling_logp_difference/mean": 0.026292257010936737, "step": 19, "step_time": 19.523712756998066 }, { "clip_ratio/high_max": 0.03435960691422224, "clip_ratio/high_mean": 0.01717980345711112, "clip_ratio/low_mean": 0.022261562990024686, "clip_ratio/low_min": 0.00893997447565198, "clip_ratio/region_mean": 0.03944136598147452, "entropy": 0.18229309655725956, "epoch": 0.0002, "grad_norm": 0.6555894613265991, "kl": 0.1375966964988038, "learning_rate": 2.66e-05, "loss": -0.0096, "step": 20, "step_time": 4.624744021999504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1175.0, "completions/max_terminated_length": 1175.0, "completions/mean_length": 1083.6875, "completions/mean_terminated_length": 1083.6875, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "entropy": 0.17547390051186085, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.7718576788902283, "kl": 0.12620768509805202, "learning_rate": 2.8e-05, "loss": -0.0036, "num_tokens": 685791.0, "reward": -0.5128062963485718, "reward_std": 0.17080315947532654, "rewards/rollout_reward_func/mean": -0.5128062963485718, "rewards/rollout_reward_func/std": 0.18223346769809723, "sampling/importance_sampling_ratio/max": 1.7905471324920654, "sampling/importance_sampling_ratio/mean": 0.9957761764526367, "sampling/importance_sampling_ratio/min": 0.3293769359588623, "sampling/sampling_logp_difference/max": 1.110552430152893, "sampling/sampling_logp_difference/mean": 0.029262032359838486, "step": 21, "step_time": 19.781279265000194 }, { "clip_ratio/high_max": 0.008477011695504189, "clip_ratio/high_mean": 0.004238505847752094, "clip_ratio/low_mean": 0.023578613065183163, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027817118912935257, "entropy": 0.1772155137732625, "epoch": 0.00022, "grad_norm": 0.6299952864646912, "kl": 0.26611568219959736, "learning_rate": 2.9399999999999996e-05, "loss": -0.0065, "step": 22, "step_time": 4.579408528998101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 1085.9375, "completions/mean_terminated_length": 1085.9375, "completions/min_length": 1022.0, "completions/min_terminated_length": 1022.0, "entropy": 0.19030261412262917, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.558867335319519, "kl": 0.10519878612831235, "learning_rate": 3.0799999999999996e-05, "loss": 0.0011, "num_tokens": 747366.0, "reward": -0.46695220470428467, "reward_std": 0.17308293282985687, "rewards/rollout_reward_func/mean": -0.46695220470428467, "rewards/rollout_reward_func/std": 0.19205814599990845, "sampling/importance_sampling_ratio/max": 1.70639967918396, "sampling/importance_sampling_ratio/mean": 1.000931978225708, "sampling/importance_sampling_ratio/min": 0.3715561032295227, "sampling/sampling_logp_difference/max": 0.9900554418563843, "sampling/sampling_logp_difference/mean": 0.024180065840482712, "step": 23, "step_time": 21.199171855001623 }, { "clip_ratio/high_max": 0.008477011695504189, "clip_ratio/high_mean": 0.004238505847752094, "clip_ratio/low_mean": 0.0236401897855103, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.027878695633262396, "entropy": 0.18348873034119606, "epoch": 0.00024, "grad_norm": 0.40123236179351807, "kl": 0.1714035333134234, "learning_rate": 3.22e-05, "loss": -0.0017, "step": 24, "step_time": 4.537545453998973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 1105.5625, "completions/mean_terminated_length": 1105.5625, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "entropy": 0.18742740340530872, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 0.5812861323356628, "kl": 0.08350833132863045, "learning_rate": 3.36e-05, "loss": -0.0105, "num_tokens": 809144.0, "reward": -0.5143345594406128, "reward_std": 0.20476500689983368, "rewards/rollout_reward_func/mean": -0.5143345594406128, "rewards/rollout_reward_func/std": 0.2123231291770935, "sampling/importance_sampling_ratio/max": 1.503713846206665, "sampling/importance_sampling_ratio/mean": 0.994086742401123, "sampling/importance_sampling_ratio/min": 0.542472779750824, "sampling/sampling_logp_difference/max": 0.6116174459457397, "sampling/sampling_logp_difference/mean": 0.022150132805109024, "step": 25, "step_time": 19.470972789997177 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.006250000325962901, "clip_ratio/low_mean": 0.008405172731727362, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014655173057690263, "entropy": 0.18047262355685234, "epoch": 0.00026, "grad_norm": 0.4743516147136688, "kl": 0.17200348246842623, "learning_rate": 3.5e-05, "loss": -0.0127, "step": 26, "step_time": 4.664985343002627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 1110.1875, "completions/mean_terminated_length": 1110.1875, "completions/min_length": 1032.0, "completions/min_terminated_length": 1032.0, "entropy": 0.17882953397929668, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 0.9188854098320007, "kl": 0.23274803906679153, "learning_rate": 3.64e-05, "loss": 0.002, "num_tokens": 871007.0, "reward": -0.4795309007167816, "reward_std": 0.22121167182922363, "rewards/rollout_reward_func/mean": -0.4795309007167816, "rewards/rollout_reward_func/std": 0.2276451587677002, "sampling/importance_sampling_ratio/max": 2.5288777351379395, "sampling/importance_sampling_ratio/mean": 1.0021171569824219, "sampling/importance_sampling_ratio/min": 0.36609092354774475, "sampling/sampling_logp_difference/max": 1.004873514175415, "sampling/sampling_logp_difference/mean": 0.0353076308965683, "step": 27, "step_time": 19.870946938000998 }, { "clip_ratio/high_max": 0.013095238711684942, "clip_ratio/high_mean": 0.008702791761606932, "clip_ratio/low_mean": 0.021120690274983644, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.029823482735082507, "entropy": 0.17198532819747925, "epoch": 0.00028, "grad_norm": 0.6592692732810974, "kl": 0.3384090745821595, "learning_rate": 3.78e-05, "loss": -0.0007, "step": 28, "step_time": 5.686317739000515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 1094.53125, "completions/mean_terminated_length": 1094.53125, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "entropy": 0.15638047084212303, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.5733070969581604, "kl": 0.10902313888072968, "learning_rate": 3.92e-05, "loss": 0.0005, "num_tokens": 932280.0, "reward": -0.5243247747421265, "reward_std": 0.14173707365989685, "rewards/rollout_reward_func/mean": -0.5243247747421265, "rewards/rollout_reward_func/std": 0.1486651599407196, "sampling/importance_sampling_ratio/max": 1.8691412210464478, "sampling/importance_sampling_ratio/mean": 0.9974087476730347, "sampling/importance_sampling_ratio/min": 0.550480842590332, "sampling/sampling_logp_difference/max": 0.6254791021347046, "sampling/sampling_logp_difference/mean": 0.023167556151747704, "step": 29, "step_time": 19.768804103001457 }, { "clip_ratio/high_max": 0.016810345463454723, "clip_ratio/high_mean": 0.008405172731727362, "clip_ratio/low_mean": 0.016954023856669664, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.025359196588397026, "entropy": 0.14598691277205944, "epoch": 0.0003, "grad_norm": 0.28303226828575134, "kl": 0.12301230430603027, "learning_rate": 4.059999999999999e-05, "loss": -0.0025, "step": 30, "step_time": 4.596348783999929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 1106.5, "completions/mean_terminated_length": 1106.5, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "entropy": 0.15545653365552425, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.559589684009552, "kl": 0.14759081695228815, "learning_rate": 4.2e-05, "loss": 0.0043, "num_tokens": 994132.0, "reward": -0.4605565667152405, "reward_std": 0.2054065763950348, "rewards/rollout_reward_func/mean": -0.4605565667152405, "rewards/rollout_reward_func/std": 0.2041986882686615, "sampling/importance_sampling_ratio/max": 2.1738381385803223, "sampling/importance_sampling_ratio/mean": 1.0003693103790283, "sampling/importance_sampling_ratio/min": 0.3891870677471161, "sampling/sampling_logp_difference/max": 0.9436951875686646, "sampling/sampling_logp_difference/mean": 0.03851354122161865, "step": 31, "step_time": 19.85710078899865 }, { "clip_ratio/high_max": 0.01759259309619665, "clip_ratio/high_mean": 0.010951468721032143, "clip_ratio/low_mean": 0.02596013550646603, "clip_ratio/low_min": 0.017273308243602514, "clip_ratio/region_mean": 0.036911603761836886, "entropy": 0.152664078399539, "epoch": 0.00032, "grad_norm": 0.4678462743759155, "kl": 0.1655140146613121, "learning_rate": 4.34e-05, "loss": 0.0015, "step": 32, "step_time": 5.196578098000828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 1106.25, "completions/mean_terminated_length": 1106.25, "completions/min_length": 1033.0, "completions/min_terminated_length": 1033.0, "entropy": 0.12942250352352858, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.6040879487991333, "kl": 0.25690607028082013, "learning_rate": 4.48e-05, "loss": -0.0023, "num_tokens": 1055695.0, "reward": -0.5021181106567383, "reward_std": 0.237224742770195, "rewards/rollout_reward_func/mean": -0.5021181106567383, "rewards/rollout_reward_func/std": 0.25904470682144165, "sampling/importance_sampling_ratio/max": 2.184476613998413, "sampling/importance_sampling_ratio/mean": 1.000129222869873, "sampling/importance_sampling_ratio/min": 0.38337117433547974, "sampling/sampling_logp_difference/max": 0.9587516784667969, "sampling/sampling_logp_difference/mean": 0.0340719148516655, "step": 33, "step_time": 19.905707126996276 }, { "clip_ratio/high_max": 0.02988505782559514, "clip_ratio/high_mean": 0.01494252891279757, "clip_ratio/low_mean": 0.025882595218718052, "clip_ratio/low_min": 0.008477011695504189, "clip_ratio/region_mean": 0.04082512413151562, "entropy": 0.12596526741981506, "epoch": 0.00034, "grad_norm": 0.3455044627189636, "kl": 0.3447681264951825, "learning_rate": 4.62e-05, "loss": -0.0057, "step": 34, "step_time": 4.614352085000064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1206.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 1105.125, "completions/mean_terminated_length": 1105.125, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "entropy": 0.12251886166632175, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 1.761839747428894, "kl": 0.30868101865053177, "learning_rate": 4.76e-05, "loss": -0.0031, "num_tokens": 1117737.0, "reward": -0.5224214792251587, "reward_std": 0.2203175276517868, "rewards/rollout_reward_func/mean": -0.5224214792251587, "rewards/rollout_reward_func/std": 0.22355018556118011, "sampling/importance_sampling_ratio/max": 2.4455726146698, "sampling/importance_sampling_ratio/mean": 1.0029780864715576, "sampling/importance_sampling_ratio/min": 0.2314785122871399, "sampling/sampling_logp_difference/max": 1.4632682800292969, "sampling/sampling_logp_difference/mean": 0.03578779101371765, "step": 35, "step_time": 19.50835349299814 }, { "clip_ratio/high_max": 0.01308497553691268, "clip_ratio/high_mean": 0.00654248776845634, "clip_ratio/low_mean": 0.025502874050289392, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03204536158591509, "entropy": 0.11845610104501247, "epoch": 0.00036, "grad_norm": 2.9334819316864014, "kl": 0.3144962238147855, "learning_rate": 4.899999999999999e-05, "loss": -0.002, "step": 36, "step_time": 4.709593042005508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1178.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 1105.1875, "completions/mean_terminated_length": 1105.1875, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "entropy": 0.10708576254546642, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 1.0198440551757812, "kl": 0.6400292217731476, "learning_rate": 5.039999999999999e-05, "loss": 0.0022, "num_tokens": 1179586.0, "reward": -0.4468666911125183, "reward_std": 0.1715434193611145, "rewards/rollout_reward_func/mean": -0.4468666911125183, "rewards/rollout_reward_func/std": 0.1782645881175995, "sampling/importance_sampling_ratio/max": 2.0236270427703857, "sampling/importance_sampling_ratio/mean": 0.9961224794387817, "sampling/importance_sampling_ratio/min": 0.04706515371799469, "sampling/sampling_logp_difference/max": 3.056222438812256, "sampling/sampling_logp_difference/mean": 0.03511383756995201, "step": 37, "step_time": 20.76767353700052 }, { "clip_ratio/high_max": 0.008630952797830105, "clip_ratio/high_mean": 0.004315476398915052, "clip_ratio/low_mean": 0.017107964726164937, "clip_ratio/low_min": 0.008630952797830105, "clip_ratio/region_mean": 0.021423440892249346, "entropy": 0.10938119702041149, "epoch": 0.00038, "grad_norm": 0.9639101028442383, "kl": 0.21173445833846927, "learning_rate": 5.179999999999999e-05, "loss": 0.0099, "step": 38, "step_time": 4.701329769999575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 1088.3125, "completions/mean_terminated_length": 1088.3125, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "entropy": 0.11678876541554928, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 1.0002318620681763, "kl": 0.215467959176749, "learning_rate": 5.32e-05, "loss": 0.0004, "num_tokens": 1241357.0, "reward": -0.46155837178230286, "reward_std": 0.22238940000534058, "rewards/rollout_reward_func/mean": -0.46155837178230286, "rewards/rollout_reward_func/std": 0.231037437915802, "sampling/importance_sampling_ratio/max": 2.318943500518799, "sampling/importance_sampling_ratio/mean": 1.0061020851135254, "sampling/importance_sampling_ratio/min": 0.36785411834716797, "sampling/sampling_logp_difference/max": 1.0000689029693604, "sampling/sampling_logp_difference/mean": 0.027780672535300255, "step": 39, "step_time": 19.769187071999113 }, { "clip_ratio/high_max": 0.029597701970487833, "clip_ratio/high_mean": 0.014798850985243917, "clip_ratio/low_mean": 0.018965517869219184, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.033764369087293744, "entropy": 0.1086076870560646, "epoch": 0.0004, "grad_norm": 0.40658679604530334, "kl": 0.22289564600214362, "learning_rate": 5.46e-05, "loss": -0.0026, "step": 40, "step_time": 4.547126809004112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 1103.53125, "completions/mean_terminated_length": 1103.53125, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "entropy": 0.10802971106022596, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.7768428921699524, "kl": 0.19403144717216492, "learning_rate": 5.6e-05, "loss": -0.0006, "num_tokens": 1303156.0, "reward": -0.4897119998931885, "reward_std": 0.1952064335346222, "rewards/rollout_reward_func/mean": -0.4897119998931885, "rewards/rollout_reward_func/std": 0.2068503350019455, "sampling/importance_sampling_ratio/max": 1.889296293258667, "sampling/importance_sampling_ratio/mean": 1.0051381587982178, "sampling/importance_sampling_ratio/min": 0.40923118591308594, "sampling/sampling_logp_difference/max": 0.8934750556945801, "sampling/sampling_logp_difference/mean": 0.021494271233677864, "step": 41, "step_time": 19.853896857999644 }, { "clip_ratio/high_max": 0.029885058291256428, "clip_ratio/high_mean": 0.019181034993380308, "clip_ratio/low_mean": 0.01688218442723155, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.036063219187781215, "entropy": 0.1090830909088254, "epoch": 0.00042, "grad_norm": 0.3662092983722687, "kl": 0.20994073105975986, "learning_rate": 5.739999999999999e-05, "loss": -0.0041, "step": 42, "step_time": 5.160369818000618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1173.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 1088.65625, "completions/mean_terminated_length": 1088.65625, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "entropy": 0.11884008627384901, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.6067479848861694, "kl": 0.2437204960733652, "learning_rate": 5.879999999999999e-05, "loss": -0.0022, "num_tokens": 1364489.0, "reward": -0.5077904462814331, "reward_std": 0.20108559727668762, "rewards/rollout_reward_func/mean": -0.5077904462814331, "rewards/rollout_reward_func/std": 0.22733040153980255, "sampling/importance_sampling_ratio/max": 1.4460588693618774, "sampling/importance_sampling_ratio/mean": 0.9939672946929932, "sampling/importance_sampling_ratio/min": 0.29875218868255615, "sampling/sampling_logp_difference/max": 1.2081408500671387, "sampling/sampling_logp_difference/mean": 0.0232787374407053, "step": 43, "step_time": 19.193595336002545 }, { "clip_ratio/high_max": 0.030224869027733803, "clip_ratio/high_mean": 0.017195767955854535, "clip_ratio/low_mean": 0.027833653381094337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.045029421569779515, "entropy": 0.11338446941226721, "epoch": 0.00044, "grad_norm": 0.4423329532146454, "kl": 0.5116472188383341, "learning_rate": 6.019999999999999e-05, "loss": -0.0057, "step": 44, "step_time": 4.609670715999528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 1076.71875, "completions/mean_terminated_length": 1076.71875, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "entropy": 0.0988692631945014, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 1.5183652639389038, "kl": 0.22475394513458014, "learning_rate": 6.159999999999999e-05, "loss": 0.003, "num_tokens": 1425654.0, "reward": -0.45213308930397034, "reward_std": 0.14726360142230988, "rewards/rollout_reward_func/mean": -0.45213308930397034, "rewards/rollout_reward_func/std": 0.20719100534915924, "sampling/importance_sampling_ratio/max": 2.3679933547973633, "sampling/importance_sampling_ratio/mean": 0.9978402256965637, "sampling/importance_sampling_ratio/min": 0.4819342792034149, "sampling/sampling_logp_difference/max": 0.8620429039001465, "sampling/sampling_logp_difference/mean": 0.023353304713964462, "step": 45, "step_time": 19.132591851001052 }, { "clip_ratio/high_max": 0.02514367876574397, "clip_ratio/high_mean": 0.012571839382871985, "clip_ratio/low_mean": 0.011711712228134274, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.02428355161100626, "entropy": 0.09813301777467132, "epoch": 0.00046, "grad_norm": 0.3528969883918762, "kl": 0.21378233283758163, "learning_rate": 6.3e-05, "loss": 0.001, "step": 46, "step_time": 4.613903227995252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 1095.65625, "completions/mean_terminated_length": 1095.65625, "completions/min_length": 1033.0, "completions/min_terminated_length": 1033.0, "entropy": 0.12270896788686514, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.6892234086990356, "kl": 0.3763636499643326, "learning_rate": 6.44e-05, "loss": 0.0028, "num_tokens": 1487583.0, "reward": -0.4293076992034912, "reward_std": 0.21600130200386047, "rewards/rollout_reward_func/mean": -0.4293076992034912, "rewards/rollout_reward_func/std": 0.23292233049869537, "sampling/importance_sampling_ratio/max": 1.8867154121398926, "sampling/importance_sampling_ratio/mean": 1.004943609237671, "sampling/importance_sampling_ratio/min": 0.6363580226898193, "sampling/sampling_logp_difference/max": 0.6348373889923096, "sampling/sampling_logp_difference/mean": 0.019537419080734253, "step": 47, "step_time": 20.42858671600152 }, { "clip_ratio/high_max": 0.025431035086512566, "clip_ratio/high_mean": 0.0148706897161901, "clip_ratio/low_mean": 0.027332287514582276, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.042202977230772376, "entropy": 0.13431571517139673, "epoch": 0.00048, "grad_norm": 0.2712605595588684, "kl": 0.38993188738822937, "learning_rate": 6.579999999999999e-05, "loss": -0.0011, "step": 48, "step_time": 4.589837898001861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1157.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 1093.09375, "completions/mean_terminated_length": 1093.09375, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "entropy": 0.1185915432870388, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.4226152002811432, "kl": 0.7659748941659927, "learning_rate": 6.72e-05, "loss": -0.0082, "num_tokens": 1548981.0, "reward": -0.4821324944496155, "reward_std": 0.21742115914821625, "rewards/rollout_reward_func/mean": -0.4821324944496155, "rewards/rollout_reward_func/std": 0.21750490367412567, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 1.009469985961914, "sampling/importance_sampling_ratio/min": 0.18863405287265778, "sampling/sampling_logp_difference/max": 1.6679463386535645, "sampling/sampling_logp_difference/mean": 0.03913179039955139, "step": 49, "step_time": 18.02275065199865 }, { "clip_ratio/high_max": 0.017251642420887947, "clip_ratio/high_mean": 0.008625821210443974, "clip_ratio/low_mean": 0.04806405236013234, "clip_ratio/low_min": 0.03392857313156128, "clip_ratio/region_mean": 0.056689873803406954, "entropy": 0.09834497049450874, "epoch": 0.0005, "grad_norm": 1.6209301948547363, "kl": 2.579437732696533, "learning_rate": 6.859999999999999e-05, "loss": 0.0048, "step": 50, "step_time": 4.540976131995194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 1088.96875, "completions/mean_terminated_length": 1088.96875, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "entropy": 0.08731328183785081, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.5635143518447876, "kl": 1.0861408486962318, "learning_rate": 7e-05, "loss": -0.0045, "num_tokens": 1610094.0, "reward": -0.43637198209762573, "reward_std": 0.27633821964263916, "rewards/rollout_reward_func/mean": -0.43637198209762573, "rewards/rollout_reward_func/std": 0.2921752333641052, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 1.0049779415130615, "sampling/importance_sampling_ratio/min": 0.3428148925304413, "sampling/sampling_logp_difference/max": 1.45022714138031, "sampling/sampling_logp_difference/mean": 0.021649528294801712, "step": 51, "step_time": 17.871381629000098 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0172573437448591, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021424010628834367, "entropy": 0.08626179583370686, "epoch": 0.00052, "grad_norm": 0.34038931131362915, "kl": 1.275597222149372, "learning_rate": 6.999927501348549e-05, "loss": -0.0046, "step": 52, "step_time": 5.037914795002507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 1095.8125, "completions/mean_terminated_length": 1094.3548583984375, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "entropy": 0.12115000281482935, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.967182993888855, "kl": 0.9825721606612206, "learning_rate": 6.999710008927679e-05, "loss": -0.0563, "num_tokens": 1671764.0, "reward": -0.4696105122566223, "reward_std": 0.22791525721549988, "rewards/rollout_reward_func/mean": -0.4696105122566223, "rewards/rollout_reward_func/std": 0.2283819615840912, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 1.0037590265274048, "sampling/importance_sampling_ratio/min": 0.05342705920338631, "sampling/sampling_logp_difference/max": 2.9294378757476807, "sampling/sampling_logp_difference/mean": 0.03147841617465019, "step": 53, "step_time": 18.86263157599933 }, { "clip_ratio/high_max": 0.03362069092690945, "clip_ratio/high_mean": 0.021120690274983644, "clip_ratio/low_mean": 0.02388250338844955, "clip_ratio/low_min": 0.008620689623057842, "clip_ratio/region_mean": 0.04500319389626384, "entropy": 0.125062495470047, "epoch": 0.00054, "grad_norm": 0.4010683298110962, "kl": 0.8681858591735363, "learning_rate": 6.999347533337664e-05, "loss": -0.0575, "step": 54, "step_time": 4.628001205999681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 1088.75, "completions/mean_terminated_length": 1088.75, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "entropy": 0.14726589154452085, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.7700212001800537, "kl": 1.3244657516479492, "learning_rate": 6.998840092245053e-05, "loss": 0.012, "num_tokens": 1732380.0, "reward": -0.4926993250846863, "reward_std": 0.24518096446990967, "rewards/rollout_reward_func/mean": -0.4926993250846863, "rewards/rollout_reward_func/std": 0.2516820430755615, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 1.0008070468902588, "sampling/importance_sampling_ratio/min": 0.48306187987327576, "sampling/sampling_logp_difference/max": 1.1459065675735474, "sampling/sampling_logp_difference/mean": 0.025315191596746445, "step": 55, "step_time": 18.05787570999746 }, { "clip_ratio/high_max": 0.02216748846694827, "clip_ratio/high_mean": 0.013167077675461769, "clip_ratio/low_mean": 0.025491471402347088, "clip_ratio/low_min": 0.004629629664123058, "clip_ratio/region_mean": 0.03865854907780886, "entropy": 0.1365279834717512, "epoch": 0.00056, "grad_norm": 0.3563438653945923, "kl": 1.4614362269639969, "learning_rate": 6.99818771038181e-05, "loss": 0.0096, "step": 56, "step_time": 5.074325516001409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 1072.375, "completions/mean_terminated_length": 1072.375, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "entropy": 0.10837440472096205, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.5596420168876648, "kl": 1.037670575082302, "learning_rate": 6.997390419544103e-05, "loss": 0.0043, "num_tokens": 1792635.0, "reward": -0.5355521440505981, "reward_std": 0.14530688524246216, "rewards/rollout_reward_func/mean": -0.5355521440505981, "rewards/rollout_reward_func/std": 0.19063040614128113, "sampling/importance_sampling_ratio/max": 1.5926164388656616, "sampling/importance_sampling_ratio/mean": 1.0045462846755981, "sampling/importance_sampling_ratio/min": 0.30828800797462463, "sampling/sampling_logp_difference/max": 1.1767208576202393, "sampling/sampling_logp_difference/mean": 0.018672389909625053, "step": 57, "step_time": 18.684464720001415 }, { "clip_ratio/high_max": 0.026808520779013634, "clip_ratio/high_mean": 0.02188127231784165, "clip_ratio/low_mean": 0.016882183961570263, "clip_ratio/low_min": 0.004310344811528921, "clip_ratio/region_mean": 0.038763455813750625, "entropy": 0.10703121963888407, "epoch": 0.00058, "grad_norm": 0.1872505098581314, "kl": 1.074892871081829, "learning_rate": 6.996448258590766e-05, "loss": 0.0015, "step": 58, "step_time": 4.56781186599801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 1086.71875, "completions/mean_terminated_length": 1086.71875, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "entropy": 0.10833515599370003, "epoch": 0.00059, "frac_reward_zero_std": 0.0, "grad_norm": 0.7075782418251038, "kl": 1.0720560997724533, "learning_rate": 6.995361273441393e-05, "loss": 0.0044, "num_tokens": 1853387.0, "reward": -0.47640368342399597, "reward_std": 0.21761885285377502, "rewards/rollout_reward_func/mean": -0.47640368342399597, "rewards/rollout_reward_func/std": 0.22357192635536194, "sampling/importance_sampling_ratio/max": 1.327568769454956, "sampling/importance_sampling_ratio/mean": 0.9998874068260193, "sampling/importance_sampling_ratio/min": 0.6657251119613647, "sampling/sampling_logp_difference/max": 0.4068784713745117, "sampling/sampling_logp_difference/mean": 0.015214748680591583, "step": 59, "step_time": 18.160416403001364 }, { "clip_ratio/high_max": 0.025297620333731174, "clip_ratio/high_mean": 0.014732143841683865, "clip_ratio/low_mean": 0.014870690181851387, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.02960283402353525, "entropy": 0.10951074492186308, "epoch": 0.0006, "grad_norm": 0.39131033420562744, "kl": 1.0591960176825523, "learning_rate": 6.994129517074108e-05, "loss": 0.0007, "step": 60, "step_time": 4.721910889000355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 1115.6875, "completions/mean_terminated_length": 1115.6875, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "entropy": 0.10607703868299723, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.2872762084007263, "kl": 1.0350849702954292, "learning_rate": 6.992753049522976e-05, "loss": 0.0054, "num_tokens": 1915315.0, "reward": -0.43617352843284607, "reward_std": 0.2522484064102173, "rewards/rollout_reward_func/mean": -0.43617352843284607, "rewards/rollout_reward_func/std": 0.2582774758338928, "sampling/importance_sampling_ratio/max": 1.2695132493972778, "sampling/importance_sampling_ratio/mean": 0.9975636005401611, "sampling/importance_sampling_ratio/min": 0.3743976354598999, "sampling/sampling_logp_difference/max": 0.9824368953704834, "sampling/sampling_logp_difference/mean": 0.014042620547115803, "step": 61, "step_time": 18.545257886997206 }, { "clip_ratio/high_max": 0.02202495001256466, "clip_ratio/high_mean": 0.013095808448269963, "clip_ratio/low_mean": 0.020183361135423183, "clip_ratio/low_min": 0.00893997447565198, "clip_ratio/region_mean": 0.033279168885201216, "entropy": 0.11789484415203333, "epoch": 0.00062, "grad_norm": 0.16561837494373322, "kl": 0.9820755273103714, "learning_rate": 6.991231937875088e-05, "loss": 0.0026, "step": 62, "step_time": 5.082353541003613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 1099.46875, "completions/mean_terminated_length": 1098.806396484375, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "entropy": 0.23455523513257504, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 0.5485014319419861, "kl": 0.8900921568274498, "learning_rate": 6.989566256267274e-05, "loss": 0.0607, "num_tokens": 1976587.0, "reward": -0.5591320991516113, "reward_std": 0.17651371657848358, "rewards/rollout_reward_func/mean": -0.5591320991516113, "rewards/rollout_reward_func/std": 0.17946277558803558, "sampling/importance_sampling_ratio/max": 1.9399454593658447, "sampling/importance_sampling_ratio/mean": 1.0042798519134521, "sampling/importance_sampling_ratio/min": 0.5940226912498474, "sampling/sampling_logp_difference/max": 0.6626598834991455, "sampling/sampling_logp_difference/mean": 0.021330349147319794, "step": 63, "step_time": 18.45355869899504 }, { "clip_ratio/high_max": 0.04348522052168846, "clip_ratio/high_mean": 0.023974753450602293, "clip_ratio/low_mean": 0.01734141679480672, "clip_ratio/low_min": 0.004261363763362169, "clip_ratio/region_mean": 0.041316170478239655, "entropy": 0.24474226590245962, "epoch": 0.00064, "grad_norm": 0.276520311832428, "kl": 0.8789360150694847, "learning_rate": 6.98775608588251e-05, "loss": 0.0539, "step": 64, "step_time": 4.612848719996691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1194.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 1060.1875, "completions/mean_terminated_length": 1061.1334228515625, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "entropy": 0.28550489246845245, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.626975953578949, "kl": 0.8598505482077599, "learning_rate": 6.985801514945947e-05, "loss": 0.1307, "num_tokens": 2036319.0, "reward": -0.3970026671886444, "reward_std": 0.2564680576324463, "rewards/rollout_reward_func/mean": -0.3970026671886444, "rewards/rollout_reward_func/std": 0.26500940322875977, "sampling/importance_sampling_ratio/max": 1.4650014638900757, "sampling/importance_sampling_ratio/mean": 0.9992122650146484, "sampling/importance_sampling_ratio/min": 0.5915941596031189, "sampling/sampling_logp_difference/max": 0.5249343514442444, "sampling/sampling_logp_difference/mean": 0.030375786125659943, "step": 65, "step_time": 18.97966624000037 }, { "clip_ratio/high_max": 0.03590243705548346, "clip_ratio/high_mean": 0.022587207495234907, "clip_ratio/low_mean": 0.02136340644210577, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.043950614403001964, "entropy": 0.30166169814765453, "epoch": 0.00066, "grad_norm": 0.3809885084629059, "kl": 0.8402063176035881, "learning_rate": 6.983702638720613e-05, "loss": 0.1209, "step": 66, "step_time": 5.571784341998864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1195.0, "completions/max_terminated_length": 1195.0, "completions/mean_length": 1112.3125, "completions/mean_terminated_length": 1115.806396484375, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "entropy": 0.17948718275874853, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.4068567752838135, "kl": 0.48361456394195557, "learning_rate": 6.981459559502773e-05, "loss": 0.0753, "num_tokens": 2098073.0, "reward": -0.5098980665206909, "reward_std": 0.270857036113739, "rewards/rollout_reward_func/mean": -0.5098980665206909, "rewards/rollout_reward_func/std": 0.27474111318588257, "sampling/importance_sampling_ratio/max": 1.618735432624817, "sampling/importance_sampling_ratio/mean": 0.9959070682525635, "sampling/importance_sampling_ratio/min": 0.3979490399360657, "sampling/sampling_logp_difference/max": 0.921431303024292, "sampling/sampling_logp_difference/mean": 0.023701874539256096, "step": 67, "step_time": 18.83939884999927 }, { "clip_ratio/high_max": 0.01742724934592843, "clip_ratio/high_mean": 0.008713624672964215, "clip_ratio/low_mean": 0.0246296098921448, "clip_ratio/low_min": 0.007007576059550047, "clip_ratio/region_mean": 0.03334323433227837, "entropy": 0.17947692796587944, "epoch": 0.00068, "grad_norm": 0.36956554651260376, "kl": 0.5682570077478886, "learning_rate": 6.979072386616947e-05, "loss": 0.0717, "step": 68, "step_time": 4.623302149002484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1287.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 1077.71875, "completions/mean_terminated_length": 1068.10009765625, "completions/min_length": 471.0, "completions/min_terminated_length": 471.0, "entropy": 0.2892078459262848, "epoch": 0.00069, "frac_reward_zero_std": 0.0, "grad_norm": 0.7553045153617859, "kl": 0.7609045431017876, "learning_rate": 6.97654123641057e-05, "loss": 0.1371, "num_tokens": 2158751.0, "reward": -0.46218380331993103, "reward_std": 0.24525763094425201, "rewards/rollout_reward_func/mean": -0.46218380331993103, "rewards/rollout_reward_func/std": 0.25703689455986023, "sampling/importance_sampling_ratio/max": 2.140455722808838, "sampling/importance_sampling_ratio/mean": 0.9979419112205505, "sampling/importance_sampling_ratio/min": 0.2021942287683487, "sampling/sampling_logp_difference/max": 1.5985264778137207, "sampling/sampling_logp_difference/mean": 0.032274819910526276, "step": 69, "step_time": 18.749426872000186 }, { "clip_ratio/high_max": 0.03526386618614197, "clip_ratio/high_mean": 0.021942277904599905, "clip_ratio/low_mean": 0.015378695214167237, "clip_ratio/low_min": 0.0028735632076859474, "clip_ratio/region_mean": 0.037320973351597786, "entropy": 0.2664843760430813, "epoch": 0.0007, "grad_norm": 0.38778552412986755, "kl": 0.689332477748394, "learning_rate": 6.973866232248336e-05, "loss": 0.1314, "step": 70, "step_time": 4.780764236003961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 1092.46875, "completions/mean_terminated_length": 1092.46875, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "entropy": 0.1402003513649106, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 0.45844557881355286, "kl": 0.6303279027342796, "learning_rate": 6.971047504506171e-05, "loss": 0.0011, "num_tokens": 2219634.0, "reward": -0.36640745401382446, "reward_std": 0.22457873821258545, "rewards/rollout_reward_func/mean": -0.36640745401382446, "rewards/rollout_reward_func/std": 0.2406391054391861, "sampling/importance_sampling_ratio/max": 1.1999346017837524, "sampling/importance_sampling_ratio/mean": 0.9979254007339478, "sampling/importance_sampling_ratio/min": 0.5383814573287964, "sampling/sampling_logp_difference/max": 0.6191879510879517, "sampling/sampling_logp_difference/mean": 0.013366520404815674, "step": 71, "step_time": 20.457377540999005 }, { "clip_ratio/high_max": 0.026223545894026756, "clip_ratio/high_mean": 0.015266945119947195, "clip_ratio/low_mean": 0.006547619355842471, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021814564242959023, "entropy": 0.13642335776239634, "epoch": 0.00072, "grad_norm": 0.18443915247917175, "kl": 0.6241349279880524, "learning_rate": 6.96808519056489e-05, "loss": -0.0005, "step": 72, "step_time": 4.561201645999972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 1119.65625, "completions/mean_terminated_length": 1119.65625, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "entropy": 0.1595595609396696, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.5977839827537537, "kl": 0.842487521469593, "learning_rate": 6.964979434803495e-05, "loss": 0.0038, "num_tokens": 2281281.0, "reward": -0.4714810848236084, "reward_std": 0.25829970836639404, "rewards/rollout_reward_func/mean": -0.4714810848236084, "rewards/rollout_reward_func/std": 0.2600136995315552, "sampling/importance_sampling_ratio/max": 2.3782870769500732, "sampling/importance_sampling_ratio/mean": 1.000692367553711, "sampling/importance_sampling_ratio/min": 0.6183164119720459, "sampling/sampling_logp_difference/max": 0.8663804531097412, "sampling/sampling_logp_difference/mean": 0.018118811771273613, "step": 73, "step_time": 19.065980461997242 }, { "clip_ratio/high_max": 0.022663519717752934, "clip_ratio/high_mean": 0.011331759858876467, "clip_ratio/low_mean": 0.02188127231784165, "clip_ratio/low_min": 0.004310344811528921, "clip_ratio/region_mean": 0.033213032176718116, "entropy": 0.16215053666383028, "epoch": 0.00074, "grad_norm": 0.23800618946552277, "kl": 0.824254896491766, "learning_rate": 6.961730388592139e-05, "loss": -0.0001, "step": 74, "step_time": 4.669764472999304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 1067.3125, "completions/mean_terminated_length": 1067.3125, "completions/min_length": 1019.0, "completions/min_terminated_length": 1019.0, "entropy": 0.1730109006166458, "epoch": 0.00075, "frac_reward_zero_std": 0.0, "grad_norm": 0.47316476702690125, "kl": 0.906402237713337, "learning_rate": 6.958338210284752e-05, "loss": 0.0057, "num_tokens": 2341341.0, "reward": -0.42323291301727295, "reward_std": 0.1940966248512268, "rewards/rollout_reward_func/mean": -0.42323291301727295, "rewards/rollout_reward_func/std": 0.20036768913269043, "sampling/importance_sampling_ratio/max": 1.598193645477295, "sampling/importance_sampling_ratio/mean": 0.993659257888794, "sampling/importance_sampling_ratio/min": 0.4855867624282837, "sampling/sampling_logp_difference/max": 0.7223973274230957, "sampling/sampling_logp_difference/mean": 0.02255510911345482, "step": 75, "step_time": 18.786332894002044 }, { "clip_ratio/high_max": 0.04683908121660352, "clip_ratio/high_mean": 0.02341954060830176, "clip_ratio/low_mean": 0.025508005637675524, "clip_ratio/low_min": 0.004464285913854837, "clip_ratio/region_mean": 0.04892754601314664, "entropy": 0.1801851950585842, "epoch": 0.00076, "grad_norm": 0.3213146924972534, "kl": 1.1258291900157928, "learning_rate": 6.954803065211319e-05, "loss": 0.002, "step": 76, "step_time": 5.470975843998531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 1094.8125, "completions/mean_terminated_length": 1094.8125, "completions/min_length": 1008.0, "completions/min_terminated_length": 1008.0, "entropy": 0.18080324493348598, "epoch": 0.00077, "frac_reward_zero_std": 0.0, "grad_norm": 0.3310779929161072, "kl": 0.5927716679871082, "learning_rate": 6.951125125669823e-05, "loss": 0.0064, "num_tokens": 2402437.0, "reward": -0.4297844171524048, "reward_std": 0.23342680931091309, "rewards/rollout_reward_func/mean": -0.4297844171524048, "rewards/rollout_reward_func/std": 0.23833869397640228, "sampling/importance_sampling_ratio/max": 1.3645645380020142, "sampling/importance_sampling_ratio/mean": 0.9980795383453369, "sampling/importance_sampling_ratio/min": 0.6917573809623718, "sampling/sampling_logp_difference/max": 0.36852002143859863, "sampling/sampling_logp_difference/mean": 0.012174522504210472, "step": 77, "step_time": 18.89690311800041 }, { "clip_ratio/high_max": 0.034800903871655464, "clip_ratio/high_mean": 0.017400451935827732, "clip_ratio/low_mean": 0.023357964819297194, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.04075841698795557, "entropy": 0.19196703471243382, "epoch": 0.00078, "grad_norm": 0.2028847187757492, "kl": 0.5656262673437595, "learning_rate": 6.947304570917848e-05, "loss": 0.0024, "step": 78, "step_time": 4.64138041499973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 1101.84375, "completions/mean_terminated_length": 1101.84375, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "entropy": 0.1995609998703003, "epoch": 0.00079, "frac_reward_zero_std": 0.0, "grad_norm": 0.42011263966560364, "kl": 0.5383942537009716, "learning_rate": 6.943341587163845e-05, "loss": 0.0026, "num_tokens": 2463358.0, "reward": -0.49121206998825073, "reward_std": 0.22279031574726105, "rewards/rollout_reward_func/mean": -0.49121206998825073, "rewards/rollout_reward_func/std": 0.21940594911575317, "sampling/importance_sampling_ratio/max": 1.432817816734314, "sampling/importance_sampling_ratio/mean": 1.0003972053527832, "sampling/importance_sampling_ratio/min": 0.6260071396827698, "sampling/sampling_logp_difference/max": 0.46839356422424316, "sampling/sampling_logp_difference/mean": 0.015329085290431976, "step": 79, "step_time": 18.978078880001704 }, { "clip_ratio/high_max": 0.0218698694370687, "clip_ratio/high_mean": 0.013018268393352628, "clip_ratio/low_mean": 0.010791826527565718, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.023810094688087702, "entropy": 0.20319851115345955, "epoch": 0.0008, "grad_norm": 0.203871488571167, "kl": 0.5247991774231195, "learning_rate": 6.939236367558048e-05, "loss": -0.0008, "step": 80, "step_time": 5.020078220000869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 1099.6875, "completions/mean_terminated_length": 1099.6875, "completions/min_length": 1007.0, "completions/min_terminated_length": 1007.0, "entropy": 0.1919056735932827, "epoch": 0.00081, "frac_reward_zero_std": 0.0, "grad_norm": 0.35180771350860596, "kl": 0.4909706600010395, "learning_rate": 6.934989112183072e-05, "loss": -0.0012, "num_tokens": 2524600.0, "reward": -0.4302617609500885, "reward_std": 0.23076964914798737, "rewards/rollout_reward_func/mean": -0.4302617609500885, "rewards/rollout_reward_func/std": 0.22744397819042206, "sampling/importance_sampling_ratio/max": 1.2500364780426025, "sampling/importance_sampling_ratio/mean": 0.9998369812965393, "sampling/importance_sampling_ratio/min": 0.5982013940811157, "sampling/sampling_logp_difference/max": 0.5138278007507324, "sampling/sampling_logp_difference/mean": 0.012774912640452385, "step": 81, "step_time": 20.1531525529972 }, { "clip_ratio/high_max": 0.0218698694370687, "clip_ratio/high_mean": 0.01093493471853435, "clip_ratio/low_mean": 0.006784802069887519, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017719736555591226, "entropy": 0.19604826532304287, "epoch": 0.00082, "grad_norm": 0.19855517148971558, "kl": 0.4264661520719528, "learning_rate": 6.93060002804415e-05, "loss": -0.0053, "step": 82, "step_time": 4.572546813998997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 1097.65625, "completions/mean_terminated_length": 1097.65625, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "entropy": 0.23367414996027946, "epoch": 0.00083, "frac_reward_zero_std": 0.0, "grad_norm": 0.48136112093925476, "kl": 0.37488402239978313, "learning_rate": 6.92606932905905e-05, "loss": 0.004, "num_tokens": 2585774.0, "reward": -0.40845346450805664, "reward_std": 0.25028541684150696, "rewards/rollout_reward_func/mean": -0.40845346450805664, "rewards/rollout_reward_func/std": 0.2632640302181244, "sampling/importance_sampling_ratio/max": 1.4429755210876465, "sampling/importance_sampling_ratio/mean": 1.0003565549850464, "sampling/importance_sampling_ratio/min": 0.5461632013320923, "sampling/sampling_logp_difference/max": 0.6048374176025391, "sampling/sampling_logp_difference/mean": 0.014902510680258274, "step": 83, "step_time": 20.181918171998404 }, { "clip_ratio/high_max": 0.02945402404293418, "clip_ratio/high_mean": 0.01903735683299601, "clip_ratio/low_mean": 0.015029762871563435, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.034067119704559445, "entropy": 0.23336116783320904, "epoch": 0.00084, "grad_norm": 0.2826317846775055, "kl": 0.5691159926354885, "learning_rate": 6.921397236047651e-05, "loss": 0.0001, "step": 84, "step_time": 4.553323033000197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1173.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 1089.4375, "completions/mean_terminated_length": 1089.4375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.22294274903833866, "epoch": 0.00085, "frac_reward_zero_std": 0.0, "grad_norm": 0.44069260358810425, "kl": 0.39200756326317787, "learning_rate": 6.916583976721175e-05, "loss": -0.0108, "num_tokens": 2646311.0, "reward": -0.4817659258842468, "reward_std": 0.19066353142261505, "rewards/rollout_reward_func/mean": -0.4817659258842468, "rewards/rollout_reward_func/std": 0.19899514317512512, "sampling/importance_sampling_ratio/max": 1.2166787385940552, "sampling/importance_sampling_ratio/mean": 0.9984630346298218, "sampling/importance_sampling_ratio/min": 0.42864790558815, "sampling/sampling_logp_difference/max": 0.8471194505691528, "sampling/sampling_logp_difference/mean": 0.016487441956996918, "step": 85, "step_time": 20.32028162899951 }, { "clip_ratio/high_max": 0.01278735650703311, "clip_ratio/high_mean": 0.006393678253516555, "clip_ratio/low_mean": 0.026142063783481717, "clip_ratio/low_min": 0.008477011695504189, "clip_ratio/region_mean": 0.03253574203699827, "entropy": 0.22116324119269848, "epoch": 0.00086, "grad_norm": 0.23452584445476532, "kl": 0.40880732610821724, "learning_rate": 6.911629785671089e-05, "loss": -0.014, "step": 86, "step_time": 5.056672691000131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 1115.375, "completions/mean_terminated_length": 1115.375, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "entropy": 0.20760609582066536, "epoch": 0.00087, "frac_reward_zero_std": 0.0, "grad_norm": 0.35622408986091614, "kl": 0.3480166494846344, "learning_rate": 6.906534904357676e-05, "loss": -0.0003, "num_tokens": 2707609.0, "reward": -0.4962503910064697, "reward_std": 0.2686586380004883, "rewards/rollout_reward_func/mean": -0.4962503910064697, "rewards/rollout_reward_func/std": 0.26484936475753784, "sampling/importance_sampling_ratio/max": 1.2621374130249023, "sampling/importance_sampling_ratio/mean": 1.0007240772247314, "sampling/importance_sampling_ratio/min": 0.7976197004318237, "sampling/sampling_logp_difference/max": 0.23280668258666992, "sampling/sampling_logp_difference/mean": 0.010749276727437973, "step": 87, "step_time": 19.595415535002758 }, { "clip_ratio/high_max": 0.01308497553691268, "clip_ratio/high_mean": 0.0086976601742208, "clip_ratio/low_mean": 0.012864327291026711, "clip_ratio/low_min": 0.004464285913854837, "clip_ratio/region_mean": 0.021561987232416868, "entropy": 0.21387534961104393, "epoch": 0.00088, "grad_norm": 0.19298388063907623, "kl": 0.34438998997211456, "learning_rate": 6.901299581098266e-05, "loss": -0.003, "step": 88, "step_time": 4.478479118995892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 1070.125, "completions/mean_terminated_length": 1070.125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.2488942462950945, "epoch": 0.00089, "frac_reward_zero_std": 0.0, "grad_norm": 0.33588406443595886, "kl": 0.3668687706813216, "learning_rate": 6.895924071055127e-05, "loss": -0.006, "num_tokens": 2767809.0, "reward": -0.47351884841918945, "reward_std": 0.2179202437400818, "rewards/rollout_reward_func/mean": -0.47351884841918945, "rewards/rollout_reward_func/std": 0.235873281955719, "sampling/importance_sampling_ratio/max": 1.6022002696990967, "sampling/importance_sampling_ratio/mean": 1.0035170316696167, "sampling/importance_sampling_ratio/min": 0.7893701195716858, "sampling/sampling_logp_difference/max": 0.4713778495788574, "sampling/sampling_logp_difference/mean": 0.013071735389530659, "step": 89, "step_time": 19.59138237300067 }, { "clip_ratio/high_max": 0.039760081097483635, "clip_ratio/high_mean": 0.02211218373849988, "clip_ratio/low_mean": 0.009011243702843785, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031123427441343665, "entropy": 0.2663054373115301, "epoch": 0.0009, "grad_norm": 0.14179401099681854, "kl": 0.34100816398859024, "learning_rate": 6.890408636223033e-05, "loss": -0.0099, "step": 90, "step_time": 5.594045355001072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 1082.71875, "completions/mean_terminated_length": 1082.71875, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "entropy": 0.3272032327950001, "epoch": 0.00091, "frac_reward_zero_std": 0.0, "grad_norm": 0.39047959446907043, "kl": 0.5060490854084492, "learning_rate": 6.884753545416499e-05, "loss": 0.0004, "num_tokens": 2828209.0, "reward": -0.44702208042144775, "reward_std": 0.1768151968717575, "rewards/rollout_reward_func/mean": -0.44702208042144775, "rewards/rollout_reward_func/std": 0.19843530654907227, "sampling/importance_sampling_ratio/max": 1.313504695892334, "sampling/importance_sampling_ratio/mean": 0.9998639822006226, "sampling/importance_sampling_ratio/min": 0.7137824892997742, "sampling/sampling_logp_difference/max": 0.337177038192749, "sampling/sampling_logp_difference/mean": 0.017330652102828026, "step": 91, "step_time": 19.705118409001443 }, { "clip_ratio/high_max": 0.029597701504826546, "clip_ratio/high_mean": 0.016954023158177733, "clip_ratio/low_mean": 0.02308728452771902, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.040041307685896754, "entropy": 0.338189210742712, "epoch": 0.00092, "grad_norm": 0.25255584716796875, "kl": 0.5081073250621557, "learning_rate": 6.87895907425667e-05, "loss": -0.0037, "step": 92, "step_time": 4.545762452002236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 1121.0, "completions/mean_terminated_length": 1121.0, "completions/min_length": 1044.0, "completions/min_terminated_length": 1044.0, "entropy": 0.31118444725871086, "epoch": 0.00093, "frac_reward_zero_std": 0.0, "grad_norm": 0.3357398509979248, "kl": 0.41464070603251457, "learning_rate": 6.873025505157899e-05, "loss": 0.0029, "num_tokens": 2889994.0, "reward": -0.48655378818511963, "reward_std": 0.2266092300415039, "rewards/rollout_reward_func/mean": -0.48655378818511963, "rewards/rollout_reward_func/std": 0.2270657867193222, "sampling/importance_sampling_ratio/max": 1.1633861064910889, "sampling/importance_sampling_ratio/mean": 0.9989911913871765, "sampling/importance_sampling_ratio/min": 0.8115768432617188, "sampling/sampling_logp_difference/max": 0.20877623558044434, "sampling/sampling_logp_difference/mean": 0.011798782274127007, "step": 93, "step_time": 19.783363214997735 }, { "clip_ratio/high_max": 0.03449302213266492, "clip_ratio/high_mean": 0.023496511159464717, "clip_ratio/low_mean": 0.023578613065183163, "clip_ratio/low_min": 0.008477011695504189, "clip_ratio/region_mean": 0.04707512445747852, "entropy": 0.3337353691458702, "epoch": 0.00094, "grad_norm": 0.17373445630073547, "kl": 0.4239913858473301, "learning_rate": 6.866953127313971e-05, "loss": -0.0003, "step": 94, "step_time": 4.663236467002207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1191.0, "completions/max_terminated_length": 1191.0, "completions/mean_length": 1107.34375, "completions/mean_terminated_length": 1107.34375, "completions/min_length": 988.0, "completions/min_terminated_length": 988.0, "entropy": 0.3390605188906193, "epoch": 0.00095, "frac_reward_zero_std": 0.0, "grad_norm": 0.32190436124801636, "kl": 0.3996665310114622, "learning_rate": 6.860742236684017e-05, "loss": 0.003, "num_tokens": 2951206.0, "reward": -0.4408472180366516, "reward_std": 0.26517271995544434, "rewards/rollout_reward_func/mean": -0.4408472180366516, "rewards/rollout_reward_func/std": 0.2688426673412323, "sampling/importance_sampling_ratio/max": 1.2432407140731812, "sampling/importance_sampling_ratio/mean": 1.0007131099700928, "sampling/importance_sampling_ratio/min": 0.7838740944862366, "sampling/sampling_logp_difference/max": 0.24350690841674805, "sampling/sampling_logp_difference/mean": 0.014349516481161118, "step": 95, "step_time": 20.95830778900381 }, { "clip_ratio/high_max": 0.029741379898041487, "clip_ratio/high_mean": 0.017025862354785204, "clip_ratio/low_mean": 0.015014367876574397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.032040230464190245, "entropy": 0.33729221299290657, "epoch": 0.00096, "grad_norm": 0.1863829344511032, "kl": 0.3920147344470024, "learning_rate": 6.854393135978081e-05, "loss": -0.0013, "step": 96, "step_time": 4.508435823005129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 1086.8125, "completions/mean_terminated_length": 1086.8125, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "entropy": 0.3320203050971031, "epoch": 0.00097, "frac_reward_zero_std": 0.0, "grad_norm": 0.3696032166481018, "kl": 0.4638562463223934, "learning_rate": 6.847906134642377e-05, "loss": 0.0067, "num_tokens": 3011536.0, "reward": -0.5202274322509766, "reward_std": 0.16256603598594666, "rewards/rollout_reward_func/mean": -0.5202274322509766, "rewards/rollout_reward_func/std": 0.1671036183834076, "sampling/importance_sampling_ratio/max": 1.277155876159668, "sampling/importance_sampling_ratio/mean": 1.000286340713501, "sampling/importance_sampling_ratio/min": 0.7637355923652649, "sampling/sampling_logp_difference/max": 0.269533634185791, "sampling/sampling_logp_difference/mean": 0.014094443060457706, "step": 97, "step_time": 19.571513411003252 }, { "clip_ratio/high_max": 0.026356960646808147, "clip_ratio/high_mean": 0.013178480323404074, "clip_ratio/low_mean": 0.029823482036590576, "clip_ratio/low_min": 0.004166666883975267, "clip_ratio/region_mean": 0.043001962127164006, "entropy": 0.3108792472630739, "epoch": 0.00098, "grad_norm": 0.2061738669872284, "kl": 0.6017032694071531, "learning_rate": 6.841281548844196e-05, "loss": 0.0022, "step": 98, "step_time": 4.486250934998679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 1063.0625, "completions/mean_terminated_length": 1063.0625, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "entropy": 0.291562894359231, "epoch": 0.00099, "frac_reward_zero_std": 0.0, "grad_norm": 0.2730512022972107, "kl": 0.4242094121873379, "learning_rate": 6.834519701456505e-05, "loss": -0.0044, "num_tokens": 3071159.0, "reward": -0.42122912406921387, "reward_std": 0.1824401617050171, "rewards/rollout_reward_func/mean": -0.42122912406921387, "rewards/rollout_reward_func/std": 0.21146075427532196, "sampling/importance_sampling_ratio/max": 1.2134770154953003, "sampling/importance_sampling_ratio/mean": 0.9990652203559875, "sampling/importance_sampling_ratio/min": 0.7035907506942749, "sampling/sampling_logp_difference/max": 0.3515584468841553, "sampling/sampling_logp_difference/mean": 0.014313156716525555, "step": 99, "step_time": 19.310070781999457 }, { "clip_ratio/high_max": 0.03779762051999569, "clip_ratio/high_mean": 0.023209155071526766, "clip_ratio/low_mean": 0.019191297702491283, "clip_ratio/low_min": 0.008928571827709675, "clip_ratio/region_mean": 0.04240045277401805, "entropy": 0.27814970910549164, "epoch": 0.001, "grad_norm": 0.22724440693855286, "kl": 0.423747755587101, "learning_rate": 6.827620922042207e-05, "loss": -0.0079, "step": 100, "step_time": 4.98858628799826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 1127.28125, "completions/mean_terminated_length": 1127.28125, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "entropy": 0.2535964362323284, "epoch": 0.00101, "frac_reward_zero_std": 0.0, "grad_norm": 0.2941298186779022, "kl": 0.29719405621290207, "learning_rate": 6.820585546838077e-05, "loss": -0.0013, "num_tokens": 3132837.0, "reward": -0.44322362542152405, "reward_std": 0.20939800143241882, "rewards/rollout_reward_func/mean": -0.44322362542152405, "rewards/rollout_reward_func/std": 0.20648106932640076, "sampling/importance_sampling_ratio/max": 1.150780439376831, "sampling/importance_sampling_ratio/mean": 1.0004308223724365, "sampling/importance_sampling_ratio/min": 0.7857727408409119, "sampling/sampling_logp_difference/max": 0.2410876750946045, "sampling/sampling_logp_difference/mean": 0.013219714164733887, "step": 101, "step_time": 20.358891846002734 }, { "clip_ratio/high_max": 0.012941297609359026, "clip_ratio/high_mean": 0.006470648804679513, "clip_ratio/low_mean": 0.026031405199319124, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.03250205423682928, "entropy": 0.22852065041661263, "epoch": 0.00102, "grad_norm": 0.18001848459243774, "kl": 0.33884071558713913, "learning_rate": 6.813413918738377e-05, "loss": -0.006, "step": 102, "step_time": 4.545632626999577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 1116.53125, "completions/mean_terminated_length": 1116.53125, "completions/min_length": 1046.0, "completions/min_terminated_length": 1046.0, "entropy": 0.20847422443330288, "epoch": 0.00103, "frac_reward_zero_std": 0.0, "grad_norm": 0.3878398537635803, "kl": 0.26171866059303284, "learning_rate": 6.806106387278142e-05, "loss": -0.0012, "num_tokens": 3194157.0, "reward": -0.46517664194107056, "reward_std": 0.22127576172351837, "rewards/rollout_reward_func/mean": -0.46517664194107056, "rewards/rollout_reward_func/std": 0.22515276074409485, "sampling/importance_sampling_ratio/max": 1.3875514268875122, "sampling/importance_sampling_ratio/mean": 0.9991943836212158, "sampling/importance_sampling_ratio/min": 0.7137066721916199, "sampling/sampling_logp_difference/max": 0.33728325366973877, "sampling/sampling_logp_difference/mean": 0.013238689862191677, "step": 103, "step_time": 19.242190885997843 }, { "clip_ratio/high_max": 0.025431035086512566, "clip_ratio/high_mean": 0.012715517543256283, "clip_ratio/low_mean": 0.017416416201740503, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.030131933512166142, "entropy": 0.19629098288714886, "epoch": 0.00104, "grad_norm": 0.23110534250736237, "kl": 0.27092156931757927, "learning_rate": 6.798663308616148e-05, "loss": -0.006, "step": 104, "step_time": 5.020551769999656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1230.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 1108.59375, "completions/mean_terminated_length": 1108.59375, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "entropy": 0.1677666027098894, "epoch": 0.00105, "frac_reward_zero_std": 0.0, "grad_norm": 0.31530749797821045, "kl": 0.29294272884726524, "learning_rate": 6.791085045517547e-05, "loss": 0.0019, "num_tokens": 3255181.0, "reward": -0.49525174498558044, "reward_std": 0.3105778694152832, "rewards/rollout_reward_func/mean": -0.49525174498558044, "rewards/rollout_reward_func/std": 0.30169883370399475, "sampling/importance_sampling_ratio/max": 1.3028675317764282, "sampling/importance_sampling_ratio/mean": 1.0011165142059326, "sampling/importance_sampling_ratio/min": 0.8031312227249146, "sampling/sampling_logp_difference/max": 0.26456761360168457, "sampling/sampling_logp_difference/mean": 0.008162112906575203, "step": 105, "step_time": 20.15425903400319 }, { "clip_ratio/high_max": 0.012643678579479456, "clip_ratio/high_mean": 0.006321839289739728, "clip_ratio/low_mean": 0.017174671636894345, "clip_ratio/low_min": 0.008477011695504189, "clip_ratio/region_mean": 0.023496510926634073, "entropy": 0.15731988102197647, "epoch": 0.00106, "grad_norm": 0.14239747822284698, "kl": 0.2901698462665081, "learning_rate": 6.783371967336192e-05, "loss": -0.0021, "step": 106, "step_time": 4.521266695001032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 1122.34375, "completions/mean_terminated_length": 1122.34375, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "entropy": 0.17082856968045235, "epoch": 0.00107, "frac_reward_zero_std": 0.0, "grad_norm": 0.3113885521888733, "kl": 0.35370404459536076, "learning_rate": 6.775524449996631e-05, "loss": 0.0013, "num_tokens": 3316654.0, "reward": -0.3569881319999695, "reward_std": 0.2422994077205658, "rewards/rollout_reward_func/mean": -0.3569881319999695, "rewards/rollout_reward_func/std": 0.23960842192173004, "sampling/importance_sampling_ratio/max": 1.2748417854309082, "sampling/importance_sampling_ratio/mean": 0.9992586374282837, "sampling/importance_sampling_ratio/min": 0.587332546710968, "sampling/sampling_logp_difference/max": 0.5321640968322754, "sampling/sampling_logp_difference/mean": 0.011872734874486923, "step": 107, "step_time": 20.06445938300385 }, { "clip_ratio/high_max": 0.012500000651925802, "clip_ratio/high_mean": 0.008333333767950535, "clip_ratio/low_mean": 0.025584976421669126, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.03391831065528095, "entropy": 0.15917087346315384, "epoch": 0.00108, "grad_norm": 0.3104178011417389, "kl": 0.5849493406713009, "learning_rate": 6.767542875975789e-05, "loss": -0.0018, "step": 108, "step_time": 4.486651512997923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 1102.71875, "completions/mean_terminated_length": 1102.71875, "completions/min_length": 1016.0, "completions/min_terminated_length": 1016.0, "entropy": 0.1286757681518793, "epoch": 0.00109, "frac_reward_zero_std": 0.0, "grad_norm": 0.22844643890857697, "kl": 0.3761631529778242, "learning_rate": 6.759427634284321e-05, "loss": -0.0025, "num_tokens": 3377471.0, "reward": -0.42406848073005676, "reward_std": 0.125291109085083, "rewards/rollout_reward_func/mean": -0.42406848073005676, "rewards/rollout_reward_func/std": 0.12395402044057846, "sampling/importance_sampling_ratio/max": 1.223881721496582, "sampling/importance_sampling_ratio/mean": 0.9980927109718323, "sampling/importance_sampling_ratio/min": 0.7033082842826843, "sampling/sampling_logp_difference/max": 0.3519599437713623, "sampling/sampling_logp_difference/mean": 0.008846405893564224, "step": 109, "step_time": 20.91520320299969 }, { "clip_ratio/high_max": 0.012643678579479456, "clip_ratio/high_mean": 0.008405172731727362, "clip_ratio/low_mean": 0.01272064889781177, "clip_ratio/low_min": 0.004310344811528921, "clip_ratio/region_mean": 0.021125821629539132, "entropy": 0.12113876082003117, "epoch": 0.0011, "grad_norm": 0.13660430908203125, "kl": 0.3955628015100956, "learning_rate": 6.751179120447662e-05, "loss": -0.0062, "step": 110, "step_time": 4.475121453999236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 1102.53125, "completions/mean_terminated_length": 1102.53125, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "entropy": 0.11297394800931215, "epoch": 0.00111, "frac_reward_zero_std": 0.0, "grad_norm": 0.4004228115081787, "kl": 0.5922140702605247, "learning_rate": 6.742797736486737e-05, "loss": -0.0061, "num_tokens": 3438324.0, "reward": -0.4662068486213684, "reward_std": 0.23961836099624634, "rewards/rollout_reward_func/mean": -0.4662068486213684, "rewards/rollout_reward_func/std": 0.24006369709968567, "sampling/importance_sampling_ratio/max": 1.245427131652832, "sampling/importance_sampling_ratio/mean": 0.9992169141769409, "sampling/importance_sampling_ratio/min": 0.7873985171318054, "sampling/sampling_logp_difference/max": 0.23902082443237305, "sampling/sampling_logp_difference/mean": 0.008674157783389091, "step": 111, "step_time": 19.75388690899854 }, { "clip_ratio/high_max": 0.030028735753148794, "clip_ratio/high_mean": 0.017097701551392674, "clip_ratio/low_mean": 0.017565795686095953, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.034663497004657984, "entropy": 0.10719773638993502, "epoch": 0.00112, "grad_norm": 0.15587779879570007, "kl": 0.6039222273975611, "learning_rate": 6.734283890898376e-05, "loss": -0.0104, "step": 112, "step_time": 4.541389584001081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 1106.625, "completions/mean_terminated_length": 1106.625, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "entropy": 0.10979633685201406, "epoch": 0.00113, "frac_reward_zero_std": 0.0, "grad_norm": 0.4424666166305542, "kl": 0.405601616948843, "learning_rate": 6.725637998635402e-05, "loss": 0.0005, "num_tokens": 3499326.0, "reward": -0.37243300676345825, "reward_std": 0.2009376734495163, "rewards/rollout_reward_func/mean": -0.37243300676345825, "rewards/rollout_reward_func/std": 0.2171618789434433, "sampling/importance_sampling_ratio/max": 1.4405823945999146, "sampling/importance_sampling_ratio/mean": 0.9954735636711121, "sampling/importance_sampling_ratio/min": 0.5101393461227417, "sampling/sampling_logp_difference/max": 0.6730713844299316, "sampling/sampling_logp_difference/mean": 0.015605228021740913, "step": 113, "step_time": 19.52763991699976 }, { "clip_ratio/high_max": 0.021551724057644606, "clip_ratio/high_mean": 0.01494252891279757, "clip_ratio/low_mean": 0.019037357065826654, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.03397988621145487, "entropy": 0.10580410901457071, "epoch": 0.00114, "grad_norm": 0.20038853585720062, "kl": 0.4876371771097183, "learning_rate": 6.716860481086407e-05, "loss": -0.0026, "step": 114, "step_time": 5.008007214999452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 1093.59375, "completions/mean_terminated_length": 1093.59375, "completions/min_length": 1019.0, "completions/min_terminated_length": 1019.0, "entropy": 0.12992096226662397, "epoch": 0.00115, "frac_reward_zero_std": 0.0, "grad_norm": 0.3423241078853607, "kl": 0.32542602345347404, "learning_rate": 6.707951766055213e-05, "loss": -0.0008, "num_tokens": 3559825.0, "reward": -0.2904287874698639, "reward_std": 0.21366259455680847, "rewards/rollout_reward_func/mean": -0.2904287874698639, "rewards/rollout_reward_func/std": 0.2277645468711853, "sampling/importance_sampling_ratio/max": 1.3079499006271362, "sampling/importance_sampling_ratio/mean": 1.000572681427002, "sampling/importance_sampling_ratio/min": 0.7608344554901123, "sampling/sampling_logp_difference/max": 0.27333950996398926, "sampling/sampling_logp_difference/mean": 0.009929574094712734, "step": 115, "step_time": 19.9443882329997 }, { "clip_ratio/high_max": 0.04195402469485998, "clip_ratio/high_mean": 0.02097701234742999, "clip_ratio/low_mean": 0.010560345137491822, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03153735748492181, "entropy": 0.1283034523949027, "epoch": 0.00116, "grad_norm": 0.2063482254743576, "kl": 0.3354860246181488, "learning_rate": 6.698912287740021e-05, "loss": -0.0055, "step": 116, "step_time": 4.394176279000021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 1116.125, "completions/mean_terminated_length": 1116.125, "completions/min_length": 1044.0, "completions/min_terminated_length": 1044.0, "entropy": 0.10509093943983316, "epoch": 0.00117, "frac_reward_zero_std": 0.0, "grad_norm": 0.2746105492115021, "kl": 0.37556229531764984, "learning_rate": 6.689742486712253e-05, "loss": 0.0045, "num_tokens": 3621115.0, "reward": -0.35871076583862305, "reward_std": 0.26629638671875, "rewards/rollout_reward_func/mean": -0.35871076583862305, "rewards/rollout_reward_func/std": 0.2609623074531555, "sampling/importance_sampling_ratio/max": 1.2699754238128662, "sampling/importance_sampling_ratio/mean": 0.9998886585235596, "sampling/importance_sampling_ratio/min": 0.6173349618911743, "sampling/sampling_logp_difference/max": 0.4823434352874756, "sampling/sampling_logp_difference/mean": 0.010066376999020576, "step": 117, "step_time": 19.912768575999507 }, { "clip_ratio/high_max": 0.02097701234742999, "clip_ratio/high_mean": 0.012571839615702629, "clip_ratio/low_mean": 0.023204024182632565, "clip_ratio/low_min": 0.01666666753590107, "clip_ratio/region_mean": 0.03577586356550455, "entropy": 0.09517586696892977, "epoch": 0.00118, "grad_norm": 0.15435738861560822, "kl": 0.40376710519194603, "learning_rate": 6.680442809895073e-05, "loss": 0.001, "step": 118, "step_time": 4.419890868999573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 1098.15625, "completions/mean_terminated_length": 1098.15625, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "entropy": 0.08835013723000884, "epoch": 0.00119, "frac_reward_zero_std": 0.0, "grad_norm": 0.3324456810951233, "kl": 0.36314607411623, "learning_rate": 6.671013710541611e-05, "loss": 0.0007, "num_tokens": 3681803.0, "reward": -0.3065405786037445, "reward_std": 0.2237951010465622, "rewards/rollout_reward_func/mean": -0.3065405786037445, "rewards/rollout_reward_func/std": 0.21822023391723633, "sampling/importance_sampling_ratio/max": 1.1267787218093872, "sampling/importance_sampling_ratio/mean": 0.9975160956382751, "sampling/importance_sampling_ratio/min": 0.6554495096206665, "sampling/sampling_logp_difference/max": 0.4224339723587036, "sampling/sampling_logp_difference/mean": 0.009264478459954262, "step": 119, "step_time": 19.809869228995012 }, { "clip_ratio/high_max": 0.025728654116392136, "clip_ratio/high_mean": 0.012864327058196068, "clip_ratio/low_mean": 0.02737582172267139, "clip_ratio/low_min": 0.016810345463454723, "clip_ratio/region_mean": 0.04024014878086746, "entropy": 0.07488261116668582, "epoch": 0.0012, "grad_norm": 0.25195708870887756, "kl": 0.4577188976109028, "learning_rate": 6.661455648212863e-05, "loss": -0.0019, "step": 120, "step_time": 4.38998275099766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1555.0, "completions/max_terminated_length": 1555.0, "completions/mean_length": 1390.0, "completions/mean_terminated_length": 1390.0, "completions/min_length": 1282.0, "completions/min_terminated_length": 1282.0, "entropy": 0.07240608939900994, "epoch": 0.00121, "frac_reward_zero_std": 0.0, "grad_norm": 0.47087979316711426, "kl": 0.3365720082074404, "learning_rate": 6.651769088755307e-05, "loss": 0.0011, "num_tokens": 3751911.0, "reward": -0.3373774588108063, "reward_std": 0.2850075364112854, "rewards/rollout_reward_func/mean": -0.3373774588108063, "rewards/rollout_reward_func/std": 0.2947815954685211, "sampling/importance_sampling_ratio/max": 1.5862010717391968, "sampling/importance_sampling_ratio/mean": 1.0028750896453857, "sampling/importance_sampling_ratio/min": 0.51273113489151, "sampling/sampling_logp_difference/max": 0.6680036783218384, "sampling/sampling_logp_difference/mean": 0.009058910422027111, "step": 121, "step_time": 21.313163071001327 }, { "clip_ratio/high_max": 0.010515873087570071, "clip_ratio/high_mean": 0.005257936543785036, "clip_ratio/low_mean": 0.014087301678955555, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.01934523822274059, "entropy": 0.06431235792115331, "epoch": 0.00122, "grad_norm": 0.24179181456565857, "kl": 0.4233996532857418, "learning_rate": 6.641954504278184e-05, "loss": -0.0019, "step": 122, "step_time": 5.2943772620001255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1538.0, "completions/max_terminated_length": 1538.0, "completions/mean_length": 1374.96875, "completions/mean_terminated_length": 1374.96875, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "entropy": 0.0671513439156115, "epoch": 0.00123, "frac_reward_zero_std": 0.0, "grad_norm": 0.507310688495636, "kl": 0.4749574400484562, "learning_rate": 6.632012373130493e-05, "loss": -0.0029, "num_tokens": 3821477.0, "reward": -0.3270702362060547, "reward_std": 0.23897257447242737, "rewards/rollout_reward_func/mean": -0.3270702362060547, "rewards/rollout_reward_func/std": 0.2439153492450714, "sampling/importance_sampling_ratio/max": 1.8515390157699585, "sampling/importance_sampling_ratio/mean": 1.0025148391723633, "sampling/importance_sampling_ratio/min": 0.620826780796051, "sampling/sampling_logp_difference/max": 0.6160171627998352, "sampling/sampling_logp_difference/mean": 0.010109664872288704, "step": 123, "step_time": 22.798276609000823 }, { "clip_ratio/high_max": 0.017775974236428738, "clip_ratio/high_mean": 0.010624098242260516, "clip_ratio/low_mean": 0.022619047900661826, "clip_ratio/low_min": 0.0069444444961845875, "clip_ratio/region_mean": 0.033243146375752985, "entropy": 0.06234947498887777, "epoch": 0.00124, "grad_norm": 0.2385358065366745, "kl": 0.43203799426555634, "learning_rate": 6.621943179877683e-05, "loss": -0.0061, "step": 124, "step_time": 6.268182998999691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 1396.59375, "completions/mean_terminated_length": 1396.59375, "completions/min_length": 1273.0, "completions/min_terminated_length": 1273.0, "entropy": 0.05260535189881921, "epoch": 0.00125, "frac_reward_zero_std": 0.0, "grad_norm": 0.8618159294128418, "kl": 0.6789176482707262, "learning_rate": 6.611747415278026e-05, "loss": 0.0043, "num_tokens": 3891742.0, "reward": -0.34599587321281433, "reward_std": 0.18537992238998413, "rewards/rollout_reward_func/mean": -0.34599587321281433, "rewards/rollout_reward_func/std": 0.1989048272371292, "sampling/importance_sampling_ratio/max": 1.5482592582702637, "sampling/importance_sampling_ratio/mean": 0.9963085055351257, "sampling/importance_sampling_ratio/min": 0.44325098395347595, "sampling/sampling_logp_difference/max": 0.8136191368103027, "sampling/sampling_logp_difference/mean": 0.01058841124176979, "step": 125, "step_time": 22.268644142000994 }, { "clip_ratio/high_max": 0.024404762079939246, "clip_ratio/high_mean": 0.013938492280431092, "clip_ratio/low_mean": 0.01393849216401577, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027876984677277505, "entropy": 0.0533214220777154, "epoch": 0.00126, "grad_norm": 0.22716756165027618, "kl": 0.5420177187770605, "learning_rate": 6.601425576258705e-05, "loss": 0.0005, "step": 126, "step_time": 5.361977264998131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 1371.09375, "completions/mean_terminated_length": 1371.09375, "completions/min_length": 1049.0, "completions/min_terminated_length": 1049.0, "entropy": 0.06317630130797625, "epoch": 0.00127, "frac_reward_zero_std": 0.0, "grad_norm": 0.4571119248867035, "kl": 0.6214037910103798, "learning_rate": 6.59097816589159e-05, "loss": 0.0055, "num_tokens": 3961173.0, "reward": -0.3198418915271759, "reward_std": 0.22759833931922913, "rewards/rollout_reward_func/mean": -0.3198418915271759, "rewards/rollout_reward_func/std": 0.23395206034183502, "sampling/importance_sampling_ratio/max": 1.6126519441604614, "sampling/importance_sampling_ratio/mean": 1.0020852088928223, "sampling/importance_sampling_ratio/min": 0.39678603410720825, "sampling/sampling_logp_difference/max": 0.9243581295013428, "sampling/sampling_logp_difference/mean": 0.012377513572573662, "step": 127, "step_time": 21.32312002699473 }, { "clip_ratio/high_max": 0.017361111473292112, "clip_ratio/high_mean": 0.010416666744276881, "clip_ratio/low_mean": 0.01934523822274059, "clip_ratio/low_min": 0.01736111124046147, "clip_ratio/region_mean": 0.029761904967017472, "entropy": 0.07054613903164864, "epoch": 0.00128, "grad_norm": 0.20527607202529907, "kl": 0.46914724446833134, "learning_rate": 6.580405693368725e-05, "loss": 0.0002, "step": 128, "step_time": 5.75660438399791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 1347.84375, "completions/mean_terminated_length": 1347.84375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.07165835704654455, "epoch": 0.00129, "frac_reward_zero_std": 0.0, "grad_norm": 0.4203159213066101, "kl": 0.37369544990360737, "learning_rate": 6.569708673977508e-05, "loss": -0.0094, "num_tokens": 4029953.0, "reward": -0.44866958260536194, "reward_std": 0.2745576798915863, "rewards/rollout_reward_func/mean": -0.44866958260536194, "rewards/rollout_reward_func/std": 0.3031750023365021, "sampling/importance_sampling_ratio/max": 1.2035694122314453, "sampling/importance_sampling_ratio/mean": 1.0014601945877075, "sampling/importance_sampling_ratio/min": 0.5930081009864807, "sampling/sampling_logp_difference/max": 0.5225472450256348, "sampling/sampling_logp_difference/mean": 0.006503256969153881, "step": 129, "step_time": 20.978233070003625 }, { "clip_ratio/high_max": 0.021130952518433332, "clip_ratio/high_mean": 0.010565476259216666, "clip_ratio/low_mean": 0.015674603288061917, "clip_ratio/low_min": 0.0069444444961845875, "clip_ratio/region_mean": 0.026240079663693905, "entropy": 0.07722306018695235, "epoch": 0.0013, "grad_norm": 0.1791069507598877, "kl": 0.36845323350280523, "learning_rate": 6.558887629075567e-05, "loss": -0.0113, "step": 130, "step_time": 5.266955766999672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 1399.25, "completions/mean_terminated_length": 1399.25, "completions/min_length": 1301.0, "completions/min_terminated_length": 1301.0, "entropy": 0.10102262534201145, "epoch": 0.00131, "frac_reward_zero_std": 0.0, "grad_norm": 0.4758385419845581, "kl": 0.4525767043232918, "learning_rate": 6.547943086065366e-05, "loss": 0.0035, "num_tokens": 4100360.0, "reward": -0.382585346698761, "reward_std": 0.21716874837875366, "rewards/rollout_reward_func/mean": -0.382585346698761, "rewards/rollout_reward_func/std": 0.21535179018974304, "sampling/importance_sampling_ratio/max": 1.4899823665618896, "sampling/importance_sampling_ratio/mean": 1.0012757778167725, "sampling/importance_sampling_ratio/min": 0.707645833492279, "sampling/sampling_logp_difference/max": 0.3987642526626587, "sampling/sampling_logp_difference/mean": 0.009505197405815125, "step": 131, "step_time": 21.907281787998727 }, { "clip_ratio/high_max": 0.024603174766525626, "clip_ratio/high_mean": 0.01577380974777043, "clip_ratio/low_mean": 0.02281746093649417, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03859127080067992, "entropy": 0.10532093420624733, "epoch": 0.00132, "grad_norm": 0.2519294321537018, "kl": 0.48891404271125793, "learning_rate": 6.536875578368495e-05, "loss": -0.0011, "step": 132, "step_time": 5.424237158998949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 1347.15625, "completions/mean_terminated_length": 1347.15625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.0834057885222137, "epoch": 0.00133, "frac_reward_zero_std": 0.0, "grad_norm": 0.38942208886146545, "kl": 0.32190256752073765, "learning_rate": 6.52568564539966e-05, "loss": -0.0232, "num_tokens": 4169061.0, "reward": -0.39113572239875793, "reward_std": 0.3298783004283905, "rewards/rollout_reward_func/mean": -0.39113572239875793, "rewards/rollout_reward_func/std": 0.35657551884651184, "sampling/importance_sampling_ratio/max": 1.250070571899414, "sampling/importance_sampling_ratio/mean": 0.9978351593017578, "sampling/importance_sampling_ratio/min": 0.5548729300498962, "sampling/sampling_logp_difference/max": 0.5890161991119385, "sampling/sampling_logp_difference/mean": 0.008746784180402756, "step": 133, "step_time": 22.77548555100111 }, { "clip_ratio/high_max": 0.01756535959430039, "clip_ratio/high_mean": 0.012254902045242488, "clip_ratio/low_mean": 0.01225490216165781, "clip_ratio/low_min": 0.007148692850023508, "clip_ratio/region_mean": 0.024509804090484977, "entropy": 0.0858388589695096, "epoch": 0.00134, "grad_norm": 0.2608133554458618, "kl": 0.55234544724226, "learning_rate": 6.514373832540411e-05, "loss": -0.0242, "step": 134, "step_time": 5.300755439000568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1505.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 1359.875, "completions/mean_terminated_length": 1359.875, "completions/min_length": 1116.0, "completions/min_terminated_length": 1116.0, "entropy": 0.09293266385793686, "epoch": 0.00135, "frac_reward_zero_std": 0.0, "grad_norm": 0.2756594717502594, "kl": 0.3760007284581661, "learning_rate": 6.502940691112546e-05, "loss": 0.0032, "num_tokens": 4238091.0, "reward": -0.3124757409095764, "reward_std": 0.2979466915130615, "rewards/rollout_reward_func/mean": -0.3124757409095764, "rewards/rollout_reward_func/std": 0.32064518332481384, "sampling/importance_sampling_ratio/max": 1.3123159408569336, "sampling/importance_sampling_ratio/mean": 1.0001496076583862, "sampling/importance_sampling_ratio/min": 0.7856245040893555, "sampling/sampling_logp_difference/max": 0.2717934846878052, "sampling/sampling_logp_difference/mean": 0.007447673007845879, "step": 135, "step_time": 22.26289443799942 }, { "clip_ratio/high_max": 0.010515873087570071, "clip_ratio/high_mean": 0.005257936543785036, "clip_ratio/low_mean": 0.010565476259216666, "clip_ratio/low_min": 0.0070436508394777775, "clip_ratio/region_mean": 0.015823412919417024, "entropy": 0.09459283109754324, "epoch": 0.00136, "grad_norm": 0.1941918432712555, "kl": 0.37206556275486946, "learning_rate": 6.491386778351248e-05, "loss": 0.0011, "step": 136, "step_time": 5.174287300003925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1558.0, "completions/max_terminated_length": 1558.0, "completions/mean_length": 1385.59375, "completions/mean_terminated_length": 1385.59375, "completions/min_length": 1140.0, "completions/min_terminated_length": 1140.0, "entropy": 0.10758042987436056, "epoch": 0.00137, "frac_reward_zero_std": 0.0, "grad_norm": 0.4516114592552185, "kl": 0.4972536042332649, "learning_rate": 6.479712657377927e-05, "loss": 0.0033, "num_tokens": 4308005.0, "reward": -0.39466363191604614, "reward_std": 0.23855216801166534, "rewards/rollout_reward_func/mean": -0.39466363191604614, "rewards/rollout_reward_func/std": 0.2420763075351715, "sampling/importance_sampling_ratio/max": 1.394841194152832, "sampling/importance_sampling_ratio/mean": 1.0001493692398071, "sampling/importance_sampling_ratio/min": 0.34221550822257996, "sampling/sampling_logp_difference/max": 1.0723146200180054, "sampling/sampling_logp_difference/mean": 0.009577883407473564, "step": 137, "step_time": 22.388965665000796 }, { "clip_ratio/high_max": 0.01746031758375466, "clip_ratio/high_mean": 0.00873015879187733, "clip_ratio/low_mean": 0.03170810057781637, "clip_ratio/low_min": 0.013888888992369175, "clip_ratio/region_mean": 0.040438259835354984, "entropy": 0.10538582410663366, "epoch": 0.00138, "grad_norm": 0.9839510917663574, "kl": 1.1291486509144306, "learning_rate": 6.467918897172769e-05, "loss": 0.0031, "step": 138, "step_time": 5.798211176001132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 1365.78125, "completions/mean_terminated_length": 1365.78125, "completions/min_length": 1284.0, "completions/min_terminated_length": 1284.0, "entropy": 0.13115237560123205, "epoch": 0.00139, "frac_reward_zero_std": 0.0, "grad_norm": 0.45241695642471313, "kl": 0.4629448987543583, "learning_rate": 6.456006072547007e-05, "loss": 0.001, "num_tokens": 4377290.0, "reward": -0.4417785406112671, "reward_std": 0.21920713782310486, "rewards/rollout_reward_func/mean": -0.4417785406112671, "rewards/rollout_reward_func/std": 0.2673465609550476, "sampling/importance_sampling_ratio/max": 1.234144687652588, "sampling/importance_sampling_ratio/mean": 0.9994381666183472, "sampling/importance_sampling_ratio/min": 0.6717762351036072, "sampling/sampling_logp_difference/max": 0.3978300094604492, "sampling/sampling_logp_difference/mean": 0.012139089405536652, "step": 139, "step_time": 21.66717720899942 }, { "clip_ratio/high_max": 0.02808123268187046, "clip_ratio/high_mean": 0.015776727464981377, "clip_ratio/low_mean": 0.02118639147374779, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03696311928797513, "entropy": 0.1275120321661234, "epoch": 0.0014, "grad_norm": 0.1970338523387909, "kl": 0.48472846671938896, "learning_rate": 6.443974764114906e-05, "loss": -0.0035, "step": 140, "step_time": 5.336225918001219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 1347.9375, "completions/mean_terminated_length": 1345.3548583984375, "completions/min_length": 1220.0, "completions/min_terminated_length": 1220.0, "entropy": 0.15396588202565908, "epoch": 0.00141, "frac_reward_zero_std": 0.0, "grad_norm": 1.0424706935882568, "kl": 0.6201082356274128, "learning_rate": 6.431825558265468e-05, "loss": -0.1121, "num_tokens": 4445957.0, "reward": -0.39346328377723694, "reward_std": 0.22894302010536194, "rewards/rollout_reward_func/mean": -0.39346328377723694, "rewards/rollout_reward_func/std": 0.23490974307060242, "sampling/importance_sampling_ratio/max": 2.7976858615875244, "sampling/importance_sampling_ratio/mean": 1.0046974420547485, "sampling/importance_sampling_ratio/min": 0.6484396457672119, "sampling/sampling_logp_difference/max": 1.0287926197052002, "sampling/sampling_logp_difference/mean": 0.014869032427668571, "step": 141, "step_time": 22.04462256300394 }, { "clip_ratio/high_max": 0.028812056872993708, "clip_ratio/high_mean": 0.014406028436496854, "clip_ratio/low_mean": 0.018224417697638273, "clip_ratio/low_min": 0.004802009440027177, "clip_ratio/region_mean": 0.03263044636696577, "entropy": 0.1560619967058301, "epoch": 0.00142, "grad_norm": 0.47388795018196106, "kl": 0.6491764895617962, "learning_rate": 6.419559047133844e-05, "loss": -0.1185, "step": 142, "step_time": 5.3357356670021545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 1386.15625, "completions/mean_terminated_length": 1386.15625, "completions/min_length": 1206.0, "completions/min_terminated_length": 1206.0, "entropy": 0.153810178861022, "epoch": 0.00143, "frac_reward_zero_std": 0.0, "grad_norm": 0.5296468734741211, "kl": 0.5138975828886032, "learning_rate": 6.407175828572481e-05, "loss": 0.0046, "num_tokens": 4515878.0, "reward": -0.5492601990699768, "reward_std": 0.26385554671287537, "rewards/rollout_reward_func/mean": -0.5492601990699768, "rewards/rollout_reward_func/std": 0.2555752098560333, "sampling/importance_sampling_ratio/max": 1.1902220249176025, "sampling/importance_sampling_ratio/mean": 0.9977631568908691, "sampling/importance_sampling_ratio/min": 0.5102993845939636, "sampling/sampling_logp_difference/max": 0.6727576851844788, "sampling/sampling_logp_difference/mean": 0.010046977549791336, "step": 143, "step_time": 22.707583329001864 }, { "clip_ratio/high_max": 0.027882346883416176, "clip_ratio/high_mean": 0.013941173441708088, "clip_ratio/low_mean": 0.017413395922631025, "clip_ratio/low_min": 0.006850600708276033, "clip_ratio/region_mean": 0.031354569480754435, "entropy": 0.15916954539716244, "epoch": 0.00144, "grad_norm": 0.2565109431743622, "kl": 0.46902178041636944, "learning_rate": 6.394676506121983e-05, "loss": 0.0001, "step": 144, "step_time": 5.331718129002184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 1331.125, "completions/mean_terminated_length": 1331.125, "completions/min_length": 1076.0, "completions/min_terminated_length": 1076.0, "entropy": 0.12926697731018066, "epoch": 0.00145, "frac_reward_zero_std": 0.0, "grad_norm": 0.3901028335094452, "kl": 0.5369635224342346, "learning_rate": 6.382061688981692e-05, "loss": -0.0077, "num_tokens": 4583960.0, "reward": -0.45125412940979004, "reward_std": 0.23216669261455536, "rewards/rollout_reward_func/mean": -0.45125412940979004, "rewards/rollout_reward_func/std": 0.23907002806663513, "sampling/importance_sampling_ratio/max": 1.148919701576233, "sampling/importance_sampling_ratio/mean": 0.9978731870651245, "sampling/importance_sampling_ratio/min": 0.744191586971283, "sampling/sampling_logp_difference/max": 0.29545676708221436, "sampling/sampling_logp_difference/mean": 0.008175171911716461, "step": 145, "step_time": 21.63189383399913 }, { "clip_ratio/high_max": 0.04072039155289531, "clip_ratio/high_mean": 0.022049385006539524, "clip_ratio/low_mean": 0.012205062434077263, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.034254447906278074, "entropy": 0.12047993252053857, "epoch": 0.00146, "grad_norm": 0.2503841817378998, "kl": 0.5871033668518066, "learning_rate": 6.369331991979996e-05, "loss": -0.0104, "step": 146, "step_time": 5.140120815003684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1360.25, "completions/mean_terminated_length": 1360.25, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.18231217144057155, "epoch": 0.00147, "frac_reward_zero_std": 0.0, "grad_norm": 0.5000494122505188, "kl": 0.3411352690309286, "learning_rate": 6.356488035544373e-05, "loss": 0.0553, "num_tokens": 4653092.0, "reward": -0.30473053455352783, "reward_std": 0.16996338963508606, "rewards/rollout_reward_func/mean": -0.30473053455352783, "rewards/rollout_reward_func/std": 0.21416036784648895, "sampling/importance_sampling_ratio/max": 1.300194501876831, "sampling/importance_sampling_ratio/mean": 0.9973474740982056, "sampling/importance_sampling_ratio/min": 0.7909995317459106, "sampling/sampling_logp_difference/max": 0.2625138759613037, "sampling/sampling_logp_difference/mean": 0.009243374690413475, "step": 147, "step_time": 21.772466440001153 }, { "clip_ratio/high_max": 0.0070436508394777775, "clip_ratio/high_mean": 0.005257936543785036, "clip_ratio/low_mean": 0.018345503718592227, "clip_ratio/low_min": 0.0069444444961845875, "clip_ratio/region_mean": 0.023603440262377262, "entropy": 0.17393829533830285, "epoch": 0.00148, "grad_norm": 0.3159714639186859, "kl": 0.3571876045316458, "learning_rate": 6.343530445671135e-05, "loss": 0.0465, "step": 148, "step_time": 6.280698637003297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 1342.28125, "completions/mean_terminated_length": 1338.6129150390625, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "entropy": 0.10892681032419205, "epoch": 0.00149, "frac_reward_zero_std": 0.0, "grad_norm": 0.5067455172538757, "kl": 0.47550987266004086, "learning_rate": 6.330459853894934e-05, "loss": 0.0039, "num_tokens": 4721594.0, "reward": -0.35202786326408386, "reward_std": 0.28926119208335876, "rewards/rollout_reward_func/mean": -0.35202786326408386, "rewards/rollout_reward_func/std": 0.2914915084838867, "sampling/importance_sampling_ratio/max": 1.798542857170105, "sampling/importance_sampling_ratio/mean": 0.9994502663612366, "sampling/importance_sampling_ratio/min": 0.6887320876121521, "sampling/sampling_logp_difference/max": 0.5869767665863037, "sampling/sampling_logp_difference/mean": 0.00826427061110735, "step": 149, "step_time": 21.666677493996758 }, { "clip_ratio/high_max": 0.014396592043340206, "clip_ratio/high_mean": 0.009281629463657737, "clip_ratio/low_mean": 0.016785926010925323, "clip_ratio/low_min": 0.010714285774156451, "clip_ratio/region_mean": 0.02606755559099838, "entropy": 0.09551453217864037, "epoch": 0.0015, "grad_norm": 0.17375075817108154, "kl": 0.4516577925533056, "learning_rate": 6.317276897257973e-05, "loss": 0.0007, "step": 150, "step_time": 5.312037860998316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 1365.34375, "completions/mean_terminated_length": 1365.34375, "completions/min_length": 1198.0, "completions/min_terminated_length": 1198.0, "entropy": 0.06328308139927685, "epoch": 0.00151, "frac_reward_zero_std": 0.0, "grad_norm": 0.5383778810501099, "kl": 1.2362031117081642, "learning_rate": 6.303982218278959e-05, "loss": -0.0036, "num_tokens": 4790817.0, "reward": -0.3593817353248596, "reward_std": 0.34038400650024414, "rewards/rollout_reward_func/mean": -0.3593817353248596, "rewards/rollout_reward_func/std": 0.3953782916069031, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 1.0000388622283936, "sampling/importance_sampling_ratio/min": 0.4184833765029907, "sampling/sampling_logp_difference/max": 1.2964458465576172, "sampling/sampling_logp_difference/mean": 0.011229651048779488, "step": 151, "step_time": 20.752377626999078 }, { "clip_ratio/high_max": 0.010615079430863261, "clip_ratio/high_mean": 0.005307539715431631, "clip_ratio/low_mean": 0.017921335413120687, "clip_ratio/low_min": 0.0069444444961845875, "clip_ratio/region_mean": 0.02322887524496764, "entropy": 0.06401210278272629, "epoch": 0.00152, "grad_norm": 0.15578021109104156, "kl": 0.6527968980371952, "learning_rate": 6.290576464921792e-05, "loss": -0.0085, "step": 152, "step_time": 5.739053225002863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 1334.0, "completions/mean_terminated_length": 1334.0, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "entropy": 0.07145139575004578, "epoch": 0.00153, "frac_reward_zero_std": 0.0, "grad_norm": 0.5472925901412964, "kl": 0.36226576566696167, "learning_rate": 6.277060290563974e-05, "loss": 0.0015, "num_tokens": 4859028.0, "reward": -0.3873451352119446, "reward_std": 0.25160253047943115, "rewards/rollout_reward_func/mean": -0.3873451352119446, "rewards/rollout_reward_func/std": 0.25160717964172363, "sampling/importance_sampling_ratio/max": 1.6815582513809204, "sampling/importance_sampling_ratio/mean": 1.0013703107833862, "sampling/importance_sampling_ratio/min": 0.678411066532135, "sampling/sampling_logp_difference/max": 0.5197209119796753, "sampling/sampling_logp_difference/mean": 0.008667878806591034, "step": 153, "step_time": 21.37804413300364 }, { "clip_ratio/high_max": 0.01746031758375466, "clip_ratio/high_mean": 0.010088854469358921, "clip_ratio/low_mean": 0.018819013377651572, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02890786819625646, "entropy": 0.07074933312833309, "epoch": 0.00154, "grad_norm": 0.155680313706398, "kl": 0.376104474067688, "learning_rate": 6.263434353964779e-05, "loss": -0.0012, "step": 154, "step_time": 5.17632765900089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 1370.71875, "completions/mean_terminated_length": 1370.71875, "completions/min_length": 1244.0, "completions/min_terminated_length": 1244.0, "entropy": 0.09461641125380993, "epoch": 0.00155, "frac_reward_zero_std": 0.0, "grad_norm": 0.5061172842979431, "kl": 0.6669844202697277, "learning_rate": 6.249699319233132e-05, "loss": 0.0044, "num_tokens": 4928452.0, "reward": -0.39461788535118103, "reward_std": 0.3296222686767578, "rewards/rollout_reward_func/mean": -0.39461788535118103, "rewards/rollout_reward_func/std": 0.36830776929855347, "sampling/importance_sampling_ratio/max": 2.403796672821045, "sampling/importance_sampling_ratio/mean": 1.0003888607025146, "sampling/importance_sampling_ratio/min": 0.5154057145118713, "sampling/sampling_logp_difference/max": 0.877049446105957, "sampling/sampling_logp_difference/mean": 0.011522997170686722, "step": 155, "step_time": 21.202050380999935 }, { "clip_ratio/high_max": 0.016310987761244178, "clip_ratio/high_mean": 0.008155493880622089, "clip_ratio/low_mean": 0.010568394092842937, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.018723888089880347, "entropy": 0.09179406659677625, "epoch": 0.00156, "grad_norm": 0.2919463515281677, "kl": 0.7908302694559097, "learning_rate": 6.235855855795248e-05, "loss": 0.0031, "step": 156, "step_time": 5.297240288999092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 1357.9375, "completions/mean_terminated_length": 1357.9375, "completions/min_length": 1018.0, "completions/min_terminated_length": 1018.0, "entropy": 0.0734132076613605, "epoch": 0.00157, "frac_reward_zero_std": 0.0, "grad_norm": 0.8622139692306519, "kl": 0.39381586574018, "learning_rate": 6.221904638362004e-05, "loss": -0.0004, "num_tokens": 4997474.0, "reward": -0.318034827709198, "reward_std": 0.22524529695510864, "rewards/rollout_reward_func/mean": -0.318034827709198, "rewards/rollout_reward_func/std": 0.2321600466966629, "sampling/importance_sampling_ratio/max": 1.7109562158584595, "sampling/importance_sampling_ratio/mean": 0.9992145895957947, "sampling/importance_sampling_ratio/min": 0.41737428307533264, "sampling/sampling_logp_difference/max": 0.8737719058990479, "sampling/sampling_logp_difference/mean": 0.011989613063633442, "step": 157, "step_time": 22.465031565998288 }, { "clip_ratio/high_max": 0.025135281728580594, "clip_ratio/high_mean": 0.014303751988336444, "clip_ratio/low_mean": 0.013938492280431092, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028242244850844145, "entropy": 0.06748128402978182, "epoch": 0.00158, "grad_norm": 0.5649423003196716, "kl": 0.5091204904019833, "learning_rate": 6.207846346896057e-05, "loss": 0.0039, "step": 158, "step_time": 5.326649410997561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 1359.1875, "completions/mean_terminated_length": 1359.1875, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "entropy": 0.07645656424574554, "epoch": 0.00159, "frac_reward_zero_std": 0.0, "grad_norm": 1.0044221878051758, "kl": 0.5330846793949604, "learning_rate": 6.1936816665787e-05, "loss": -0.0004, "num_tokens": 5066523.0, "reward": -0.392457515001297, "reward_std": 0.3343449831008911, "rewards/rollout_reward_func/mean": -0.392457515001297, "rewards/rollout_reward_func/std": 0.3482252359390259, "sampling/importance_sampling_ratio/max": 1.7241829633712769, "sampling/importance_sampling_ratio/mean": 1.0009022951126099, "sampling/importance_sampling_ratio/min": 0.7782901525497437, "sampling/sampling_logp_difference/max": 0.5447533130645752, "sampling/sampling_logp_difference/mean": 0.007528146728873253, "step": 159, "step_time": 21.646616375999656 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.01340253569651395, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.01861086906865239, "entropy": 0.07297475403174758, "epoch": 0.0016, "grad_norm": 0.13396874070167542, "kl": 0.5128170438110828, "learning_rate": 6.179411287776466e-05, "loss": -0.0018, "step": 160, "step_time": 5.380177323999305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 1373.46875, "completions/mean_terminated_length": 1373.46875, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "entropy": 0.05619392590597272, "epoch": 0.00161, "frac_reward_zero_std": 0.0, "grad_norm": 0.5769148468971252, "kl": 0.4363054446876049, "learning_rate": 6.165035906007487e-05, "loss": -0.0081, "num_tokens": 5136078.0, "reward": -0.36155906319618225, "reward_std": 0.20524969696998596, "rewards/rollout_reward_func/mean": -0.36155906319618225, "rewards/rollout_reward_func/std": 0.21293611824512482, "sampling/importance_sampling_ratio/max": 1.4475702047348022, "sampling/importance_sampling_ratio/mean": 0.9991693496704102, "sampling/importance_sampling_ratio/min": 0.5885672569274902, "sampling/sampling_logp_difference/max": 0.5300641059875488, "sampling/sampling_logp_difference/mean": 0.008272118866443634, "step": 161, "step_time": 20.998420776002604 }, { "clip_ratio/high_max": 0.030884503154084086, "clip_ratio/high_mean": 0.017227965756319463, "clip_ratio/low_mean": 0.010934934369288385, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028162900125607848, "entropy": 0.058541097678244114, "epoch": 0.00162, "grad_norm": 0.1596163809299469, "kl": 0.4523899294435978, "learning_rate": 6.150556221907589e-05, "loss": -0.0108, "step": 162, "step_time": 6.255413859002147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 1408.40625, "completions/mean_terminated_length": 1408.40625, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "entropy": 0.06392064830288291, "epoch": 0.00163, "frac_reward_zero_std": 0.0, "grad_norm": 0.39812105894088745, "kl": 0.5187443308532238, "learning_rate": 6.135972941196149e-05, "loss": -0.0003, "num_tokens": 5206744.0, "reward": -0.3612610995769501, "reward_std": 0.2435275912284851, "rewards/rollout_reward_func/mean": -0.3612610995769501, "rewards/rollout_reward_func/std": 0.23848959803581238, "sampling/importance_sampling_ratio/max": 1.241699457168579, "sampling/importance_sampling_ratio/mean": 0.9999805092811584, "sampling/importance_sampling_ratio/min": 0.7010536193847656, "sampling/sampling_logp_difference/max": 0.35517096519470215, "sampling/sampling_logp_difference/mean": 0.005780600942671299, "step": 163, "step_time": 21.915730714998062 }, { "clip_ratio/high_max": 0.021037581842392683, "clip_ratio/high_mean": 0.01225490216165781, "clip_ratio/low_mean": 0.005310457549057901, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01756535971071571, "entropy": 0.06519393436610699, "epoch": 0.00164, "grad_norm": 0.16093458235263824, "kl": 0.5271580889821053, "learning_rate": 6.121286774641694e-05, "loss": -0.0029, "step": 164, "step_time": 5.349044247997881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 1418.65625, "completions/mean_terminated_length": 1418.65625, "completions/min_length": 1287.0, "completions/min_terminated_length": 1287.0, "entropy": 0.0829668384976685, "epoch": 0.00165, "frac_reward_zero_std": 0.0, "grad_norm": 0.5243796706199646, "kl": 0.8073729686439037, "learning_rate": 6.106498438027262e-05, "loss": 0.0034, "num_tokens": 5277727.0, "reward": -0.2886959910392761, "reward_std": 0.20568063855171204, "rewards/rollout_reward_func/mean": -0.2886959910392761, "rewards/rollout_reward_func/std": 0.20821760594844818, "sampling/importance_sampling_ratio/max": 1.2351570129394531, "sampling/importance_sampling_ratio/mean": 0.9996572732925415, "sampling/importance_sampling_ratio/min": 0.22778014838695526, "sampling/sampling_logp_difference/max": 1.4793744087219238, "sampling/sampling_logp_difference/mean": 0.008630713447928429, "step": 165, "step_time": 22.559136789997865 }, { "clip_ratio/high_max": 0.010515873087570071, "clip_ratio/high_mean": 0.005257936543785036, "clip_ratio/low_mean": 0.010515873087570071, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.01577380974777043, "entropy": 0.08756258385255933, "epoch": 0.00166, "grad_norm": 0.7616586685180664, "kl": 0.42006588354706764, "learning_rate": 6.091608652115516e-05, "loss": 0.008, "step": 166, "step_time": 5.369019541001762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 1315.59375, "completions/mean_terminated_length": 1315.59375, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "entropy": 0.08452755119651556, "epoch": 0.00167, "frac_reward_zero_std": 0.0, "grad_norm": 0.6370934844017029, "kl": 0.4675045907497406, "learning_rate": 6.0766181426136166e-05, "loss": -0.0062, "num_tokens": 5345312.0, "reward": -0.36821120977401733, "reward_std": 0.2555796802043915, "rewards/rollout_reward_func/mean": -0.36821120977401733, "rewards/rollout_reward_func/std": 0.27734851837158203, "sampling/importance_sampling_ratio/max": 1.347963571548462, "sampling/importance_sampling_ratio/mean": 0.9996823072433472, "sampling/importance_sampling_ratio/min": 0.6569048166275024, "sampling/sampling_logp_difference/max": 0.4202161431312561, "sampling/sampling_logp_difference/mean": 0.006768511608242989, "step": 167, "step_time": 22.3090551319965 }, { "clip_ratio/high_max": 0.014285714365541935, "clip_ratio/high_mean": 0.0071428571827709675, "clip_ratio/low_mean": 0.015823412803001702, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.02296627010218799, "entropy": 0.08340210095047951, "epoch": 0.00168, "grad_norm": 0.2162996232509613, "kl": 0.5694398619234562, "learning_rate": 6.0615276401378485e-05, "loss": -0.0085, "step": 168, "step_time": 5.161122823999904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 1383.96875, "completions/mean_terminated_length": 1383.96875, "completions/min_length": 1251.0, "completions/min_terminated_length": 1251.0, "entropy": 0.08735544933006167, "epoch": 0.00169, "frac_reward_zero_std": 0.0, "grad_norm": 0.4866945147514343, "kl": 1.0533548630774021, "learning_rate": 6.046337880178011e-05, "loss": 0.0083, "num_tokens": 5415154.0, "reward": -0.3454635739326477, "reward_std": 0.27443355321884155, "rewards/rollout_reward_func/mean": -0.3454635739326477, "rewards/rollout_reward_func/std": 0.2638706862926483, "sampling/importance_sampling_ratio/max": 1.419394612312317, "sampling/importance_sampling_ratio/mean": 1.001217246055603, "sampling/importance_sampling_ratio/min": 0.7310315370559692, "sampling/sampling_logp_difference/max": 0.35023045539855957, "sampling/sampling_logp_difference/mean": 0.004952777177095413, "step": 169, "step_time": 21.714275573000123 }, { "clip_ratio/high_max": 0.013988095335662365, "clip_ratio/high_mean": 0.0069940476678311825, "clip_ratio/low_mean": 0.016914682695642114, "clip_ratio/low_min": 0.010416666744276881, "clip_ratio/region_mean": 0.023908730479888618, "entropy": 0.09184496756643057, "epoch": 0.0017, "grad_norm": 0.14754191040992737, "kl": 0.7493918314576149, "learning_rate": 6.031049603061577e-05, "loss": 0.0047, "step": 170, "step_time": 5.2702093240040995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 1406.46875, "completions/mean_terminated_length": 1406.46875, "completions/min_length": 1296.0, "completions/min_terminated_length": 1296.0, "entropy": 0.13914490398019552, "epoch": 0.00171, "frac_reward_zero_std": 0.0, "grad_norm": 0.49122703075408936, "kl": 0.5209255404770374, "learning_rate": 6.0156635539176026e-05, "loss": 0.0066, "num_tokens": 5485780.0, "reward": -0.3194975256919861, "reward_std": 0.21816720068454742, "rewards/rollout_reward_func/mean": -0.3194975256919861, "rewards/rollout_reward_func/std": 0.22450043261051178, "sampling/importance_sampling_ratio/max": 1.3527188301086426, "sampling/importance_sampling_ratio/mean": 1.0011281967163086, "sampling/importance_sampling_ratio/min": 0.8163910508155823, "sampling/sampling_logp_difference/max": 0.3021165132522583, "sampling/sampling_logp_difference/mean": 0.006197115406394005, "step": 171, "step_time": 23.10362755100323 }, { "clip_ratio/high_max": 0.006071428535506129, "clip_ratio/high_mean": 0.0030357142677530646, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006941964267753065, "entropy": 0.13829630333930254, "epoch": 0.00172, "grad_norm": 0.24552969634532928, "kl": 0.5439451225101948, "learning_rate": 6.000180482640418e-05, "loss": 0.0029, "step": 172, "step_time": 6.3299720879986126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 1365.8125, "completions/mean_terminated_length": 1365.8125, "completions/min_length": 1262.0, "completions/min_terminated_length": 1262.0, "entropy": 0.09364470187574625, "epoch": 0.00173, "frac_reward_zero_std": 0.0, "grad_norm": 0.33461546897888184, "kl": 0.4023494981229305, "learning_rate": 5.9846011438530735e-05, "loss": -0.0003, "num_tokens": 5555022.0, "reward": -0.3629325032234192, "reward_std": 0.1729220300912857, "rewards/rollout_reward_func/mean": -0.3629325032234192, "rewards/rollout_reward_func/std": 0.22274866700172424, "sampling/importance_sampling_ratio/max": 1.2739063501358032, "sampling/importance_sampling_ratio/mean": 0.9994348883628845, "sampling/importance_sampling_ratio/min": 0.7954524159431458, "sampling/sampling_logp_difference/max": 0.24208801984786987, "sampling/sampling_logp_difference/mean": 0.0053595914505422115, "step": 173, "step_time": 22.12134751800113 }, { "clip_ratio/high_max": 0.020801012869924307, "clip_ratio/high_mean": 0.010400506434962153, "clip_ratio/low_mean": 0.008981092483736575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019381598918698728, "entropy": 0.1040582712739706, "epoch": 0.00174, "grad_norm": 0.18805073201656342, "kl": 0.40879157558083534, "learning_rate": 5.968926296870564e-05, "loss": -0.0015, "step": 174, "step_time": 5.227478744000109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 1402.71875, "completions/mean_terminated_length": 1400.9676513671875, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "entropy": 0.17123102210462093, "epoch": 0.00175, "frac_reward_zero_std": 0.0, "grad_norm": 0.7787586450576782, "kl": 0.667484425008297, "learning_rate": 5.9531567056628145e-05, "loss": -0.2427, "num_tokens": 5625557.0, "reward": -0.34476056694984436, "reward_std": 0.2764643430709839, "rewards/rollout_reward_func/mean": -0.34476056694984436, "rewards/rollout_reward_func/std": 0.2936684191226959, "sampling/importance_sampling_ratio/max": 1.522752046585083, "sampling/importance_sampling_ratio/mean": 1.0025287866592407, "sampling/importance_sampling_ratio/min": 0.7351745367050171, "sampling/sampling_logp_difference/max": 0.42051923274993896, "sampling/sampling_logp_difference/mean": 0.012459341436624527, "step": 175, "step_time": 22.63647987299919 }, { "clip_ratio/high_max": 0.02364684676285833, "clip_ratio/high_mean": 0.01638691540574655, "clip_ratio/low_mean": 0.028645347920246422, "clip_ratio/low_min": 0.01365740760229528, "clip_ratio/region_mean": 0.04503226315136999, "entropy": 0.19258347898721695, "epoch": 0.00176, "grad_norm": 0.4203491806983948, "kl": 0.7775666117668152, "learning_rate": 5.937293138817454e-05, "loss": -0.248, "step": 176, "step_time": 5.8163775660032115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1550.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 1370.875, "completions/mean_terminated_length": 1358.137939453125, "completions/min_length": 1212.0, "completions/min_terminated_length": 1212.0, "entropy": 0.2705659456551075, "epoch": 0.00177, "frac_reward_zero_std": 0.0, "grad_norm": 0.8998391032218933, "kl": 0.9797024875879288, "learning_rate": 5.921336369502351e-05, "loss": -0.1517, "num_tokens": 5694898.0, "reward": -0.29364699125289917, "reward_std": 0.24293771386146545, "rewards/rollout_reward_func/mean": -0.29364699125289917, "rewards/rollout_reward_func/std": 0.26164188981056213, "sampling/importance_sampling_ratio/max": 1.6261235475540161, "sampling/importance_sampling_ratio/mean": 1.000903844833374, "sampling/importance_sampling_ratio/min": 0.2690878212451935, "sampling/sampling_logp_difference/max": 1.3127174377441406, "sampling/sampling_logp_difference/mean": 0.019402947276830673, "step": 177, "step_time": 24.965951733996917 }, { "clip_ratio/high_max": 0.024081207579001784, "clip_ratio/high_mean": 0.014676658436655998, "clip_ratio/low_mean": 0.03735651134047657, "clip_ratio/low_min": 0.018974531209096313, "clip_ratio/region_mean": 0.05203316966071725, "entropy": 0.28039277344942093, "epoch": 0.00178, "grad_norm": 0.7427012324333191, "kl": 1.0252062007784843, "learning_rate": 5.905287175427931e-05, "loss": -0.1657, "step": 178, "step_time": 5.267517704998681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1566.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 1380.0, "completions/mean_terminated_length": 1354.125, "completions/min_length": 1248.0, "completions/min_terminated_length": 1248.0, "entropy": 0.2824288532137871, "epoch": 0.00179, "frac_reward_zero_std": 0.0, "grad_norm": 0.7640906572341919, "kl": 0.8127790912985802, "learning_rate": 5.88914633880927e-05, "loss": -0.0032, "num_tokens": 5764568.0, "reward": -0.2887653708457947, "reward_std": 0.18895351886749268, "rewards/rollout_reward_func/mean": -0.2887653708457947, "rewards/rollout_reward_func/std": 0.19820521771907806, "sampling/importance_sampling_ratio/max": 1.7501327991485596, "sampling/importance_sampling_ratio/mean": 0.9989570379257202, "sampling/importance_sampling_ratio/min": 0.6625958681106567, "sampling/sampling_logp_difference/max": 0.5596916675567627, "sampling/sampling_logp_difference/mean": 0.01639205403625965, "step": 179, "step_time": 25.13392851000026 }, { "clip_ratio/high_max": 0.04037906741723418, "clip_ratio/high_mean": 0.021571849705651402, "clip_ratio/low_mean": 0.01993165723979473, "clip_ratio/low_min": 0.008850423386320472, "clip_ratio/region_mean": 0.04150350671261549, "entropy": 0.2766301892697811, "epoch": 0.0018, "grad_norm": 0.729733943939209, "kl": 0.7896942421793938, "learning_rate": 5.872914646327972e-05, "loss": -0.0115, "step": 180, "step_time": 5.312245937000625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 1434.8125, "completions/mean_terminated_length": 1412.2222900390625, "completions/min_length": 1248.0, "completions/min_terminated_length": 1248.0, "entropy": 0.35496676340699196, "epoch": 0.00181, "frac_reward_zero_std": 0.0, "grad_norm": 0.8963547945022583, "kl": 0.7206331565976143, "learning_rate": 5.856592889093833e-05, "loss": 0.2326, "num_tokens": 5836041.0, "reward": -0.37407177686691284, "reward_std": 0.20317839086055756, "rewards/rollout_reward_func/mean": -0.37407177686691284, "rewards/rollout_reward_func/std": 0.2206260710954666, "sampling/importance_sampling_ratio/max": 1.3628172874450684, "sampling/importance_sampling_ratio/mean": 1.000978708267212, "sampling/importance_sampling_ratio/min": 0.6258657574653625, "sampling/sampling_logp_difference/max": 0.46861934661865234, "sampling/sampling_logp_difference/mean": 0.0184138435870409, "step": 181, "step_time": 27.150895172002492 }, { "clip_ratio/high_max": 0.020431800629012287, "clip_ratio/high_mean": 0.01189985265955329, "clip_ratio/low_mean": 0.021856972482055426, "clip_ratio/low_min": 0.005292319576255977, "clip_ratio/region_mean": 0.033756825840100646, "entropy": 0.3485443666577339, "epoch": 0.00182, "grad_norm": 0.6160404086112976, "kl": 0.7425114065408707, "learning_rate": 5.840181862606271e-05, "loss": 0.221, "step": 182, "step_time": 5.887024248000671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1687.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 1399.1875, "completions/mean_terminated_length": 1378.3043212890625, "completions/min_length": 1267.0, "completions/min_terminated_length": 1267.0, "entropy": 0.29488292150199413, "epoch": 0.00183, "frac_reward_zero_std": 0.0, "grad_norm": 0.8933266997337341, "kl": 0.7591585107147694, "learning_rate": 5.8236823667155656e-05, "loss": 0.2712, "num_tokens": 5906324.0, "reward": -0.2797722816467285, "reward_std": 0.23282885551452637, "rewards/rollout_reward_func/mean": -0.2797722816467285, "rewards/rollout_reward_func/std": 0.23616720736026764, "sampling/importance_sampling_ratio/max": 1.6696398258209229, "sampling/importance_sampling_ratio/mean": 1.000313639640808, "sampling/importance_sampling_ratio/min": 0.6033305525779724, "sampling/sampling_logp_difference/max": 0.5126079320907593, "sampling/sampling_logp_difference/mean": 0.01613360270857811, "step": 183, "step_time": 26.127671610000107 }, { "clip_ratio/high_max": 0.03695504565257579, "clip_ratio/high_mean": 0.026847063447348773, "clip_ratio/low_mean": 0.02087040978949517, "clip_ratio/low_min": 0.00867704686243087, "clip_ratio/region_mean": 0.047717473236843944, "entropy": 0.28915288858115673, "epoch": 0.00184, "grad_norm": 0.8169076442718506, "kl": 0.7234941683709621, "learning_rate": 5.807095205583868e-05, "loss": 0.2601, "step": 184, "step_time": 5.527719798996259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1477.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 1371.75, "completions/mean_terminated_length": 1349.0, "completions/min_length": 1244.0, "completions/min_terminated_length": 1244.0, "entropy": 0.2280177678912878, "epoch": 0.00185, "frac_reward_zero_std": 0.0, "grad_norm": 0.830865204334259, "kl": 0.6856850758194923, "learning_rate": 5.790421187646012e-05, "loss": 0.3041, "num_tokens": 5975699.0, "reward": -0.3061700463294983, "reward_std": 0.25091445446014404, "rewards/rollout_reward_func/mean": -0.3061700463294983, "rewards/rollout_reward_func/std": 0.2691905200481415, "sampling/importance_sampling_ratio/max": 1.2828229665756226, "sampling/importance_sampling_ratio/mean": 0.999343752861023, "sampling/importance_sampling_ratio/min": 0.6843889951705933, "sampling/sampling_logp_difference/max": 0.3792288303375244, "sampling/sampling_logp_difference/mean": 0.012603959068655968, "step": 185, "step_time": 25.269482095996864 }, { "clip_ratio/high_max": 0.014544930774718523, "clip_ratio/high_mean": 0.007964991265907884, "clip_ratio/low_mean": 0.01945860293926671, "clip_ratio/low_min": 0.004167181206867099, "clip_ratio/region_mean": 0.027423594205174595, "entropy": 0.22204891592264175, "epoch": 0.00186, "grad_norm": 0.5350422263145447, "kl": 0.7192875221371651, "learning_rate": 5.773661125570107e-05, "loss": 0.2927, "step": 186, "step_time": 6.166586990000724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 1376.40625, "completions/mean_terminated_length": 1369.60009765625, "completions/min_length": 1236.0, "completions/min_terminated_length": 1236.0, "entropy": 0.24230779334902763, "epoch": 0.00187, "frac_reward_zero_std": 0.0, "grad_norm": 0.6572520732879639, "kl": 0.6323206033557653, "learning_rate": 5.7568158362179356e-05, "loss": 0.0289, "num_tokens": 6045263.0, "reward": -0.28280019760131836, "reward_std": 0.3265167474746704, "rewards/rollout_reward_func/mean": -0.28280019760131836, "rewards/rollout_reward_func/std": 0.3527776896953583, "sampling/importance_sampling_ratio/max": 1.6859551668167114, "sampling/importance_sampling_ratio/mean": 1.000498652458191, "sampling/importance_sampling_ratio/min": 0.6922063231468201, "sampling/sampling_logp_difference/max": 0.5223323106765747, "sampling/sampling_logp_difference/mean": 0.01742139831185341, "step": 187, "step_time": 23.636038182999982 }, { "clip_ratio/high_max": 0.013951554195955396, "clip_ratio/high_mean": 0.008277860470116138, "clip_ratio/low_mean": 0.023003130801953375, "clip_ratio/low_min": 0.009785353671759367, "clip_ratio/region_mean": 0.03128099150490016, "entropy": 0.2358413664624095, "epoch": 0.00188, "grad_norm": 0.5200794339179993, "kl": 0.6190500557422638, "learning_rate": 5.739886140605134e-05, "loss": 0.0208, "step": 188, "step_time": 5.230759133997708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 1365.625, "completions/mean_terminated_length": 1365.10009765625, "completions/min_length": 1186.0, "completions/min_terminated_length": 1186.0, "entropy": 0.3350282786414027, "epoch": 0.00189, "frac_reward_zero_std": 0.0, "grad_norm": 0.9670495986938477, "kl": 0.8959760889410973, "learning_rate": 5.7228728638611844e-05, "loss": -0.2249, "num_tokens": 6114482.0, "reward": -0.34656822681427, "reward_std": 0.21292103826999664, "rewards/rollout_reward_func/mean": -0.34656822681427, "rewards/rollout_reward_func/std": 0.2278636395931244, "sampling/importance_sampling_ratio/max": 1.4408118724822998, "sampling/importance_sampling_ratio/mean": 0.9997934103012085, "sampling/importance_sampling_ratio/min": 0.3205134868621826, "sampling/sampling_logp_difference/max": 1.1378309726715088, "sampling/sampling_logp_difference/mean": 0.02067149057984352, "step": 189, "step_time": 23.5755817590034 }, { "clip_ratio/high_max": 0.025011671939864755, "clip_ratio/high_mean": 0.022237948374822736, "clip_ratio/low_mean": 0.027734285918995738, "clip_ratio/low_min": 0.012602133443579078, "clip_ratio/region_mean": 0.04997223545797169, "entropy": 0.33259922731667757, "epoch": 0.0019, "grad_norm": 1.1187939643859863, "kl": 0.8970839232206345, "learning_rate": 5.7057768351891896e-05, "loss": -0.2295, "step": 190, "step_time": 5.174691636999341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1526.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 1396.125, "completions/mean_terminated_length": 1378.5238037109375, "completions/min_length": 1243.0, "completions/min_terminated_length": 1243.0, "entropy": 0.2976476922631264, "epoch": 0.00191, "frac_reward_zero_std": 0.0, "grad_norm": 0.9142627120018005, "kl": 0.7026498690247536, "learning_rate": 5.688598887825471e-05, "loss": 0.2117, "num_tokens": 6184680.0, "reward": -0.2714994549751282, "reward_std": 0.2981591820716858, "rewards/rollout_reward_func/mean": -0.2714994549751282, "rewards/rollout_reward_func/std": 0.3146918714046478, "sampling/importance_sampling_ratio/max": 1.6815978288650513, "sampling/importance_sampling_ratio/mean": 0.99751877784729, "sampling/importance_sampling_ratio/min": 0.4942559599876404, "sampling/sampling_logp_difference/max": 0.7047017812728882, "sampling/sampling_logp_difference/mean": 0.018766513094305992, "step": 191, "step_time": 26.044353490000503 }, { "clip_ratio/high_max": 0.021012298529967666, "clip_ratio/high_mean": 0.012414546101354063, "clip_ratio/low_mean": 0.01626936940010637, "clip_ratio/low_min": 0.004953069495968521, "clip_ratio/region_mean": 0.02868391538504511, "entropy": 0.2896879091858864, "epoch": 0.00192, "grad_norm": 0.638110876083374, "kl": 0.7114327773451805, "learning_rate": 5.671339858998949e-05, "loss": 0.202, "step": 192, "step_time": 5.2386240960040595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1388.09375, "completions/mean_terminated_length": 1381.0, "completions/min_length": 1272.0, "completions/min_terminated_length": 1272.0, "entropy": 0.24199994280934334, "epoch": 0.00193, "frac_reward_zero_std": 0.0, "grad_norm": 1.0989079475402832, "kl": 0.9861822128295898, "learning_rate": 5.6540005898903366e-05, "loss": 0.1331, "num_tokens": 6254616.0, "reward": -0.2982085049152374, "reward_std": 0.21444223821163177, "rewards/rollout_reward_func/mean": -0.2982085049152374, "rewards/rollout_reward_func/std": 0.21825402975082397, "sampling/importance_sampling_ratio/max": 1.4363564252853394, "sampling/importance_sampling_ratio/mean": 0.9997080564498901, "sampling/importance_sampling_ratio/min": 0.6372756361961365, "sampling/sampling_logp_difference/max": 0.45055294036865234, "sampling/sampling_logp_difference/mean": 0.01593942567706108, "step": 193, "step_time": 23.73952172999634 }, { "clip_ratio/high_max": 0.017689965781755745, "clip_ratio/high_mean": 0.009580276964697987, "clip_ratio/low_mean": 0.027442879567388445, "clip_ratio/low_min": 0.01548924739472568, "clip_ratio/region_mean": 0.03702315647387877, "entropy": 0.23389813117682934, "epoch": 0.00194, "grad_norm": 0.7611336708068848, "kl": 1.0061018094420433, "learning_rate": 5.636581925591151e-05, "loss": 0.1227, "step": 194, "step_time": 5.265725776000181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1708.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 1413.96875, "completions/mean_terminated_length": 1403.71435546875, "completions/min_length": 1255.0, "completions/min_terminated_length": 1255.0, "entropy": 0.24653965793550014, "epoch": 0.00195, "frac_reward_zero_std": 0.0, "grad_norm": 1.083544373512268, "kl": 1.1075733676552773, "learning_rate": 5.6190847150625144e-05, "loss": -0.0762, "num_tokens": 6325430.0, "reward": -0.3674285411834717, "reward_std": 0.20667299628257751, "rewards/rollout_reward_func/mean": -0.3674285411834717, "rewards/rollout_reward_func/std": 0.23734495043754578, "sampling/importance_sampling_ratio/max": 2.9900710582733154, "sampling/importance_sampling_ratio/mean": 1.0014631748199463, "sampling/importance_sampling_ratio/min": 0.11657001823186874, "sampling/sampling_logp_difference/max": 2.1492631435394287, "sampling/sampling_logp_difference/mean": 0.02286156266927719, "step": 195, "step_time": 25.042322685998442 }, { "clip_ratio/high_max": 0.03843500465154648, "clip_ratio/high_mean": 0.02081770682707429, "clip_ratio/low_mean": 0.01731083437334746, "clip_ratio/low_min": 0.0071428571827709675, "clip_ratio/region_mean": 0.03812854120042175, "entropy": 0.24831104092299938, "epoch": 0.00196, "grad_norm": 0.6835814714431763, "kl": 1.0841351598501205, "learning_rate": 5.601509811093784e-05, "loss": -0.087, "step": 196, "step_time": 6.935342509003021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 1429.0625, "completions/mean_terminated_length": 1412.8182373046875, "completions/min_length": 1298.0, "completions/min_terminated_length": 1298.0, "entropy": 0.3409156911075115, "epoch": 0.00197, "frac_reward_zero_std": 0.0, "grad_norm": 1.4692267179489136, "kl": 0.9723362550139427, "learning_rate": 5.5838580702609855e-05, "loss": 0.421, "num_tokens": 6396738.0, "reward": -0.361378937959671, "reward_std": 0.2009008228778839, "rewards/rollout_reward_func/mean": -0.361378937959671, "rewards/rollout_reward_func/std": 0.20180241763591766, "sampling/importance_sampling_ratio/max": 1.5989360809326172, "sampling/importance_sampling_ratio/mean": 0.9999433755874634, "sampling/importance_sampling_ratio/min": 0.4515508711338043, "sampling/sampling_logp_difference/max": 0.795067310333252, "sampling/sampling_logp_difference/mean": 0.025070182979106903, "step": 197, "step_time": 24.958232220002174 }, { "clip_ratio/high_max": 0.02129257144406438, "clip_ratio/high_mean": 0.01271240133792162, "clip_ratio/low_mean": 0.026925578364171088, "clip_ratio/low_min": 0.006832379498519003, "clip_ratio/region_mean": 0.03963797970209271, "entropy": 0.3410366214811802, "epoch": 0.00198, "grad_norm": 0.9190803170204163, "kl": 0.9648435786366463, "learning_rate": 5.566130352885062e-05, "loss": 0.4015, "step": 198, "step_time": 5.25340941200011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 1378.1875, "completions/mean_terminated_length": 1377.10009765625, "completions/min_length": 1240.0, "completions/min_terminated_length": 1240.0, "entropy": 0.2282365132123232, "epoch": 0.00199, "frac_reward_zero_std": 0.0, "grad_norm": 2.0103631019592285, "kl": 1.1482964158058167, "learning_rate": 5.54832752298995e-05, "loss": 0.172, "num_tokens": 6466365.0, "reward": -0.40189409255981445, "reward_std": 0.2048446387052536, "rewards/rollout_reward_func/mean": -0.40189409255981445, "rewards/rollout_reward_func/std": 0.216032937169075, "sampling/importance_sampling_ratio/max": 2.8838419914245605, "sampling/importance_sampling_ratio/mean": 1.0022861957550049, "sampling/importance_sampling_ratio/min": 0.1418568193912506, "sampling/sampling_logp_difference/max": 1.9529370069503784, "sampling/sampling_logp_difference/mean": 0.02343554049730301, "step": 199, "step_time": 22.943363440997928 }, { "clip_ratio/high_max": 0.024403998162597418, "clip_ratio/high_mean": 0.012201999081298709, "clip_ratio/low_mean": 0.027254750719293952, "clip_ratio/low_min": 0.016701093409210443, "clip_ratio/region_mean": 0.039456749334931374, "entropy": 0.22876960039138794, "epoch": 0.002, "grad_norm": 0.8000804781913757, "kl": 1.1147452145814896, "learning_rate": 5.5304504482604614e-05, "loss": 0.1666, "step": 200, "step_time": 5.807316655998875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 1389.71875, "completions/mean_terminated_length": 1388.0966796875, "completions/min_length": 1259.0, "completions/min_terminated_length": 1259.0, "entropy": 0.14839737303555012, "epoch": 0.00201, "frac_reward_zero_std": 0.0, "grad_norm": 0.9227963089942932, "kl": 1.5733852088451385, "learning_rate": 5.512499999999999e-05, "loss": 0.1977, "num_tokens": 6536365.0, "reward": -0.31250572204589844, "reward_std": 0.21716922521591187, "rewards/rollout_reward_func/mean": -0.31250572204589844, "rewards/rollout_reward_func/std": 0.22341714799404144, "sampling/importance_sampling_ratio/max": 1.5922414064407349, "sampling/importance_sampling_ratio/mean": 0.9995514154434204, "sampling/importance_sampling_ratio/min": 0.580122709274292, "sampling/sampling_logp_difference/max": 0.5445156097412109, "sampling/sampling_logp_difference/mean": 0.01445447001606226, "step": 201, "step_time": 24.356689609003297 }, { "clip_ratio/high_max": 0.015827361145056784, "clip_ratio/high_mean": 0.008846516429912299, "clip_ratio/low_mean": 0.022368015139363706, "clip_ratio/low_min": 0.003821839112788439, "clip_ratio/region_mean": 0.0312145312782377, "entropy": 0.15504851192235947, "epoch": 0.00202, "grad_norm": 0.543289303779602, "kl": 1.3054933845996857, "learning_rate": 5.494477053088087e-05, "loss": 0.1884, "step": 202, "step_time": 5.336076139998113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 1352.9375, "completions/mean_terminated_length": 1352.9375, "completions/min_length": 1210.0, "completions/min_terminated_length": 1210.0, "entropy": 0.18713398836553097, "epoch": 0.00203, "frac_reward_zero_std": 0.0, "grad_norm": 0.8426447510719299, "kl": 1.4434166848659515, "learning_rate": 5.476382485937729e-05, "loss": 0.0724, "num_tokens": 6605183.0, "reward": -0.36560648679733276, "reward_std": 0.1975928544998169, "rewards/rollout_reward_func/mean": -0.36560648679733276, "rewards/rollout_reward_func/std": 0.21334274113178253, "sampling/importance_sampling_ratio/max": 1.4104684591293335, "sampling/importance_sampling_ratio/mean": 0.9996151924133301, "sampling/importance_sampling_ratio/min": 0.7314652800559998, "sampling/sampling_logp_difference/max": 0.3439218997955322, "sampling/sampling_logp_difference/mean": 0.015328658744692802, "step": 203, "step_time": 22.47584490400004 }, { "clip_ratio/high_max": 0.034462666837498546, "clip_ratio/high_mean": 0.020907803787849844, "clip_ratio/low_mean": 0.022808443871326745, "clip_ratio/low_min": 0.014086952432990074, "clip_ratio/region_mean": 0.04371624789200723, "entropy": 0.19931460358202457, "epoch": 0.00204, "grad_norm": 0.45797812938690186, "kl": 1.2634044885635376, "learning_rate": 5.458217180452602e-05, "loss": 0.0654, "step": 204, "step_time": 5.222942520997094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 1362.5, "completions/mean_terminated_length": 1362.5, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "entropy": 0.22297158651053905, "epoch": 0.00205, "frac_reward_zero_std": 0.0, "grad_norm": 0.6819816827774048, "kl": 1.2924859821796417, "learning_rate": 5.439982021984069e-05, "loss": -0.0197, "num_tokens": 6674319.0, "reward": -0.3272818922996521, "reward_std": 0.33916381001472473, "rewards/rollout_reward_func/mean": -0.3272818922996521, "rewards/rollout_reward_func/std": 0.36076751351356506, "sampling/importance_sampling_ratio/max": 1.5597563982009888, "sampling/importance_sampling_ratio/mean": 1.00142240524292, "sampling/importance_sampling_ratio/min": 0.6801483035087585, "sampling/sampling_logp_difference/max": 0.44452965259552, "sampling/sampling_logp_difference/mean": 0.015292677097022533, "step": 205, "step_time": 22.862623359997087 }, { "clip_ratio/high_max": 0.02239961549639702, "clip_ratio/high_mean": 0.017678433563560247, "clip_ratio/low_mean": 0.012258673086762428, "clip_ratio/low_min": 0.0076306844130158424, "clip_ratio/region_mean": 0.029937105951830745, "entropy": 0.23203575797379017, "epoch": 0.00206, "grad_norm": 0.4359395503997803, "kl": 1.2874633967876434, "learning_rate": 5.421677899288024e-05, "loss": -0.0246, "step": 206, "step_time": 6.192528550000134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1383.5625, "completions/mean_terminated_length": 1383.5625, "completions/min_length": 1146.0, "completions/min_terminated_length": 1146.0, "entropy": 0.21756884083151817, "epoch": 0.00207, "frac_reward_zero_std": 0.0, "grad_norm": 0.8522008061408997, "kl": 1.4864706173539162, "learning_rate": 5.403305704481589e-05, "loss": 0.0512, "num_tokens": 6744143.0, "reward": -0.3393877446651459, "reward_std": 0.3063514232635498, "rewards/rollout_reward_func/mean": -0.3393877446651459, "rewards/rollout_reward_func/std": 0.3566916882991791, "sampling/importance_sampling_ratio/max": 1.371942162513733, "sampling/importance_sampling_ratio/mean": 1.0016239881515503, "sampling/importance_sampling_ratio/min": 0.5216642618179321, "sampling/sampling_logp_difference/max": 0.650731086730957, "sampling/sampling_logp_difference/mean": 0.014331892132759094, "step": 207, "step_time": 22.399081269999442 }, { "clip_ratio/high_max": 0.019414583453908563, "clip_ratio/high_mean": 0.009707291726954281, "clip_ratio/low_mean": 0.031709898728877306, "clip_ratio/low_min": 0.015184038784354925, "clip_ratio/region_mean": 0.04141719080507755, "entropy": 0.23709845170378685, "epoch": 0.00208, "grad_norm": 0.5248006582260132, "kl": 1.3648644983768463, "learning_rate": 5.384866332999615e-05, "loss": 0.0475, "step": 208, "step_time": 5.26467002100253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1648.0, "completions/max_terminated_length": 1587.0, "completions/mean_length": 1389.34375, "completions/mean_terminated_length": 1377.300048828125, "completions/min_length": 1160.0, "completions/min_terminated_length": 1160.0, "entropy": 0.25011553056538105, "epoch": 0.00209, "frac_reward_zero_std": 0.0, "grad_norm": 0.7820115089416504, "kl": 1.2170803546905518, "learning_rate": 5.3663606835510564e-05, "loss": -0.0359, "num_tokens": 6814185.0, "reward": -0.37812256813049316, "reward_std": 0.24015262722969055, "rewards/rollout_reward_func/mean": -0.37812256813049316, "rewards/rollout_reward_func/std": 0.261532187461853, "sampling/importance_sampling_ratio/max": 1.524796485900879, "sampling/importance_sampling_ratio/mean": 0.999364972114563, "sampling/importance_sampling_ratio/min": 0.7068181037902832, "sampling/sampling_logp_difference/max": 0.421860933303833, "sampling/sampling_logp_difference/mean": 0.014625634998083115, "step": 209, "step_time": 22.935232449999603 }, { "clip_ratio/high_max": 0.032479636487551033, "clip_ratio/high_mean": 0.01835846231551841, "clip_ratio/low_mean": 0.01918400323484093, "clip_ratio/low_min": 0.004825036274269223, "clip_ratio/region_mean": 0.037542465957812965, "entropy": 0.2524045743048191, "epoch": 0.0021, "grad_norm": 0.4722338616847992, "kl": 1.2736061662435532, "learning_rate": 5.3477896580751593e-05, "loss": -0.0466, "step": 210, "step_time": 5.983930624001005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 1373.59375, "completions/mean_terminated_length": 1370.741943359375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "entropy": 0.37133462727069855, "epoch": 0.00211, "frac_reward_zero_std": 0.0, "grad_norm": 1.0533723831176758, "kl": 1.4984942823648453, "learning_rate": 5.329154161697505e-05, "loss": -0.1553, "num_tokens": 6883756.0, "reward": -0.47837746143341064, "reward_std": 0.23657377064228058, "rewards/rollout_reward_func/mean": -0.47837746143341064, "rewards/rollout_reward_func/std": 0.24600698053836823, "sampling/importance_sampling_ratio/max": 1.840681791305542, "sampling/importance_sampling_ratio/mean": 1.001286268234253, "sampling/importance_sampling_ratio/min": 0.15406915545463562, "sampling/sampling_logp_difference/max": 1.8703536987304688, "sampling/sampling_logp_difference/mean": 0.024860475212335587, "step": 211, "step_time": 23.391987199998766 }, { "clip_ratio/high_max": 0.045271459268406034, "clip_ratio/high_mean": 0.02504248369950801, "clip_ratio/low_mean": 0.013543764245696366, "clip_ratio/low_min": 0.010449161287397146, "clip_ratio/region_mean": 0.0385862480616197, "entropy": 0.3774155415594578, "epoch": 0.00212, "grad_norm": 0.6968172192573547, "kl": 1.454681470990181, "learning_rate": 5.310455102685897e-05, "loss": -0.1687, "step": 212, "step_time": 5.486532523000278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 1359.46875, "completions/mean_terminated_length": 1356.1290283203125, "completions/min_length": 1103.0, "completions/min_terminated_length": 1103.0, "entropy": 0.32659154385328293, "epoch": 0.00213, "frac_reward_zero_std": 0.0, "grad_norm": 1.078034520149231, "kl": 1.3498117551207542, "learning_rate": 5.291693392406091e-05, "loss": 0.1252, "num_tokens": 6952821.0, "reward": -0.2653856873512268, "reward_std": 0.3602794110774994, "rewards/rollout_reward_func/mean": -0.2653856873512268, "rewards/rollout_reward_func/std": 0.4300389885902405, "sampling/importance_sampling_ratio/max": 1.4525595903396606, "sampling/importance_sampling_ratio/mean": 0.9994673728942871, "sampling/importance_sampling_ratio/min": 0.5413858890533447, "sampling/sampling_logp_difference/max": 0.6136230230331421, "sampling/sampling_logp_difference/mean": 0.01882324367761612, "step": 213, "step_time": 22.02800410099917 }, { "clip_ratio/high_max": 0.029498543357476592, "clip_ratio/high_mean": 0.01597476180177182, "clip_ratio/low_mean": 0.03302490431815386, "clip_ratio/low_min": 0.025227966252714396, "clip_ratio/region_mean": 0.048999666003510356, "entropy": 0.329021118581295, "epoch": 0.00214, "grad_norm": 0.5261781811714172, "kl": 1.3097499310970306, "learning_rate": 5.2728699452773764e-05, "loss": 0.1154, "step": 214, "step_time": 5.2251156639995315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 1383.125, "completions/mean_terminated_length": 1378.7667236328125, "completions/min_length": 1265.0, "completions/min_terminated_length": 1265.0, "entropy": 0.3268757350742817, "epoch": 0.00215, "frac_reward_zero_std": 0.0, "grad_norm": 1.0495915412902832, "kl": 1.3418261259794235, "learning_rate": 5.2539856787280114e-05, "loss": -0.0806, "num_tokens": 7022620.0, "reward": -0.42317694425582886, "reward_std": 0.22051481902599335, "rewards/rollout_reward_func/mean": -0.42317694425582886, "rewards/rollout_reward_func/std": 0.24051433801651, "sampling/importance_sampling_ratio/max": 1.6022220849990845, "sampling/importance_sampling_ratio/mean": 1.0013232231140137, "sampling/importance_sampling_ratio/min": 0.6437124609947205, "sampling/sampling_logp_difference/max": 0.4713914394378662, "sampling/sampling_logp_difference/mean": 0.019625240936875343, "step": 215, "step_time": 23.670126053997592 }, { "clip_ratio/high_max": 0.02929159300401807, "clip_ratio/high_mean": 0.018675401690416038, "clip_ratio/low_mean": 0.02217663876945153, "clip_ratio/low_min": 0.004441824043169618, "clip_ratio/region_mean": 0.04085203958675265, "entropy": 0.3406996987760067, "epoch": 0.00216, "grad_norm": 0.6607075929641724, "kl": 1.3299503177404404, "learning_rate": 5.235041513150506e-05, "loss": -0.0881, "step": 216, "step_time": 6.063559613998223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1519.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 1353.09375, "completions/mean_terminated_length": 1350.0667724609375, "completions/min_length": 1188.0, "completions/min_terminated_length": 1188.0, "entropy": 0.4072920363396406, "epoch": 0.00217, "frac_reward_zero_std": 0.0, "grad_norm": 1.1069542169570923, "kl": 1.3975814431905746, "learning_rate": 5.216038371856763e-05, "loss": 0.0799, "num_tokens": 7091429.0, "reward": -0.37214675545692444, "reward_std": 0.30217793583869934, "rewards/rollout_reward_func/mean": -0.37214675545692444, "rewards/rollout_reward_func/std": 0.3419589102268219, "sampling/importance_sampling_ratio/max": 1.4979000091552734, "sampling/importance_sampling_ratio/mean": 1.0010831356048584, "sampling/importance_sampling_ratio/min": 0.6897256374359131, "sampling/sampling_logp_difference/max": 0.4040641784667969, "sampling/sampling_logp_difference/mean": 0.023276949301362038, "step": 217, "step_time": 22.50060009000299 }, { "clip_ratio/high_max": 0.029155281372368336, "clip_ratio/high_mean": 0.01856573624536395, "clip_ratio/low_mean": 0.021492298925295472, "clip_ratio/low_min": 0.00613717723172158, "clip_ratio/region_mean": 0.040058034704998136, "entropy": 0.4181174226105213, "epoch": 0.00218, "grad_norm": 0.8002489805221558, "kl": 1.3754362612962723, "learning_rate": 5.196977181033079e-05, "loss": 0.0647, "step": 218, "step_time": 5.2282358720003685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1572.0, "completions/max_terminated_length": 1572.0, "completions/mean_length": 1397.78125, "completions/mean_terminated_length": 1395.0689697265625, "completions/min_length": 1205.0, "completions/min_terminated_length": 1205.0, "entropy": 0.35726359859108925, "epoch": 0.00219, "frac_reward_zero_std": 0.0, "grad_norm": 1.1676479578018188, "kl": 1.2419040352106094, "learning_rate": 5.177858869695006e-05, "loss": 0.0711, "num_tokens": 7161762.0, "reward": -0.4244651198387146, "reward_std": 0.2482985407114029, "rewards/rollout_reward_func/mean": -0.4244651198387146, "rewards/rollout_reward_func/std": 0.2593657672405243, "sampling/importance_sampling_ratio/max": 1.6375157833099365, "sampling/importance_sampling_ratio/mean": 0.9999154806137085, "sampling/importance_sampling_ratio/min": 0.5514493584632874, "sampling/sampling_logp_difference/max": 0.5952053070068359, "sampling/sampling_logp_difference/mean": 0.019869744777679443, "step": 219, "step_time": 22.470976016998975 }, { "clip_ratio/high_max": 0.020117249921895564, "clip_ratio/high_mean": 0.014329458412248641, "clip_ratio/low_mean": 0.019821486086584628, "clip_ratio/low_min": 0.009452253812924027, "clip_ratio/region_mean": 0.03415094455704093, "entropy": 0.35601820424199104, "epoch": 0.0022, "grad_norm": 0.6768138408660889, "kl": 1.2363557517528534, "learning_rate": 5.158684369642065e-05, "loss": 0.0602, "step": 220, "step_time": 6.1034889859965915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 1400.3125, "completions/mean_terminated_length": 1395.2857666015625, "completions/min_length": 1217.0, "completions/min_terminated_length": 1217.0, "entropy": 0.35076490230858326, "epoch": 0.00221, "frac_reward_zero_std": 0.0, "grad_norm": 0.829781174659729, "kl": 1.2899986803531647, "learning_rate": 5.139454615412339e-05, "loss": -0.0344, "num_tokens": 7232169.0, "reward": -0.2982328534126282, "reward_std": 0.1823849380016327, "rewards/rollout_reward_func/mean": -0.2982328534126282, "rewards/rollout_reward_func/std": 0.18283535540103912, "sampling/importance_sampling_ratio/max": 1.3394988775253296, "sampling/importance_sampling_ratio/mean": 0.9985803365707397, "sampling/importance_sampling_ratio/min": 0.661496639251709, "sampling/sampling_logp_difference/max": 0.4132504463195801, "sampling/sampling_logp_difference/mean": 0.017737243324518204, "step": 221, "step_time": 23.13983517499946 }, { "clip_ratio/high_max": 0.028027038322761655, "clip_ratio/high_mean": 0.015548705996479839, "clip_ratio/low_mean": 0.012465797422919422, "clip_ratio/low_min": 0.0033783784601837397, "clip_ratio/region_mean": 0.028014503477606922, "entropy": 0.3569837845861912, "epoch": 0.00222, "grad_norm": 0.8349130749702454, "kl": 1.4125534296035767, "learning_rate": 5.1201705442369166e-05, "loss": -0.0402, "step": 222, "step_time": 5.324575794998964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 1425.5625, "completions/mean_terminated_length": 1422.72412109375, "completions/min_length": 1180.0, "completions/min_terminated_length": 1180.0, "entropy": 0.42438133619725704, "epoch": 0.00223, "frac_reward_zero_std": 0.0, "grad_norm": 1.35781729221344, "kl": 2.9894486367702484, "learning_rate": 5.100833095994224e-05, "loss": 0.087, "num_tokens": 7303401.0, "reward": -0.31269654631614685, "reward_std": 0.2915668487548828, "rewards/rollout_reward_func/mean": -0.31269654631614685, "rewards/rollout_reward_func/std": 0.3267049193382263, "sampling/importance_sampling_ratio/max": 1.5907464027404785, "sampling/importance_sampling_ratio/mean": 1.0015251636505127, "sampling/importance_sampling_ratio/min": 0.7033774256706238, "sampling/sampling_logp_difference/max": 0.4642033576965332, "sampling/sampling_logp_difference/mean": 0.021778199821710587, "step": 223, "step_time": 23.25856891500007 }, { "clip_ratio/high_max": 0.024233989184722304, "clip_ratio/high_mean": 0.013036112184636295, "clip_ratio/low_mean": 0.010281345807015896, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023317457758821547, "entropy": 0.4347520284354687, "epoch": 0.00224, "grad_norm": 1.1930365562438965, "kl": 1.957970529794693, "learning_rate": 5.081443213164207e-05, "loss": 0.075, "step": 224, "step_time": 5.316070877997845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 1436.09375, "completions/mean_terminated_length": 1433.7667236328125, "completions/min_length": 1302.0, "completions/min_terminated_length": 1302.0, "entropy": 0.4418147075921297, "epoch": 0.00225, "frac_reward_zero_std": 0.0, "grad_norm": 0.8514827489852905, "kl": 1.1649749651551247, "learning_rate": 5.0620018407824e-05, "loss": 0.0107, "num_tokens": 7374964.0, "reward": -0.4054383933544159, "reward_std": 0.29173243045806885, "rewards/rollout_reward_func/mean": -0.4054383933544159, "rewards/rollout_reward_func/std": 0.2957523763179779, "sampling/importance_sampling_ratio/max": 1.4339555501937866, "sampling/importance_sampling_ratio/mean": 0.9982475638389587, "sampling/importance_sampling_ratio/min": 0.776236891746521, "sampling/sampling_logp_difference/max": 0.36043667793273926, "sampling/sampling_logp_difference/mean": 0.020644396543502808, "step": 225, "step_time": 24.384883568003715 }, { "clip_ratio/high_max": 0.034377373522147536, "clip_ratio/high_mean": 0.019839379470795393, "clip_ratio/low_mean": 0.02329024113714695, "clip_ratio/low_min": 0.009065793012268841, "clip_ratio/region_mean": 0.043129620840772986, "entropy": 0.4414524957537651, "epoch": 0.00226, "grad_norm": 0.5920740962028503, "kl": 1.1851855516433716, "learning_rate": 5.042509926393865e-05, "loss": -0.0045, "step": 226, "step_time": 5.392359794001095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 1363.125, "completions/mean_terminated_length": 1358.9676513671875, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "entropy": 0.37177780643105507, "epoch": 0.00227, "frac_reward_zero_std": 0.0, "grad_norm": 0.8064789772033691, "kl": 1.3081348538398743, "learning_rate": 5.0229684200070084e-05, "loss": -0.1274, "num_tokens": 7444150.0, "reward": -0.4020261764526367, "reward_std": 0.330782949924469, "rewards/rollout_reward_func/mean": -0.4020261764526367, "rewards/rollout_reward_func/std": 0.32456374168395996, "sampling/importance_sampling_ratio/max": 1.9573839902877808, "sampling/importance_sampling_ratio/mean": 1.0011073350906372, "sampling/importance_sampling_ratio/min": 0.6370452046394348, "sampling/sampling_logp_difference/max": 0.6716089248657227, "sampling/sampling_logp_difference/mean": 0.01980571448802948, "step": 227, "step_time": 22.293594700999165 }, { "clip_ratio/high_max": 0.042878018342889845, "clip_ratio/high_mean": 0.024638742150273174, "clip_ratio/low_mean": 0.012293477193452418, "clip_ratio/low_min": 0.00657894741743803, "clip_ratio/region_mean": 0.036932219460140914, "entropy": 0.374892883002758, "epoch": 0.00228, "grad_norm": 0.5232493281364441, "kl": 1.2986047714948654, "learning_rate": 5.003378274047285e-05, "loss": -0.1355, "step": 228, "step_time": 5.343615432000661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 1393.875, "completions/mean_terminated_length": 1388.966796875, "completions/min_length": 1243.0, "completions/min_terminated_length": 1243.0, "entropy": 0.31629947386682034, "epoch": 0.00229, "frac_reward_zero_std": 0.0, "grad_norm": 0.7163617610931396, "kl": 1.079894870519638, "learning_rate": 4.983740443310771e-05, "loss": 0.0002, "num_tokens": 7514349.0, "reward": -0.3907000422477722, "reward_std": 0.20461291074752808, "rewards/rollout_reward_func/mean": -0.3907000422477722, "rewards/rollout_reward_func/std": 0.2138475477695465, "sampling/importance_sampling_ratio/max": 1.6370956897735596, "sampling/importance_sampling_ratio/mean": 0.9999720454216003, "sampling/importance_sampling_ratio/min": 0.7024121880531311, "sampling/sampling_logp_difference/max": 0.4929237365722656, "sampling/sampling_logp_difference/mean": 0.015373698435723782, "step": 229, "step_time": 23.203826416001903 }, { "clip_ratio/high_max": 0.0229482629802078, "clip_ratio/high_mean": 0.015278564882464707, "clip_ratio/low_mean": 0.019702481105923653, "clip_ratio/low_min": 0.007566824089735746, "clip_ratio/region_mean": 0.034981045639142394, "entropy": 0.3328620679676533, "epoch": 0.0023, "grad_norm": 0.5242230296134949, "kl": 1.0355066135525703, "learning_rate": 4.964055884917629e-05, "loss": -0.0077, "step": 230, "step_time": 6.108329704999051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 1383.84375, "completions/mean_terminated_length": 1383.84375, "completions/min_length": 1163.0, "completions/min_terminated_length": 1163.0, "entropy": 0.5211316458880901, "epoch": 0.00231, "frac_reward_zero_std": 0.0, "grad_norm": 0.6902910470962524, "kl": 1.2462179064750671, "learning_rate": 4.9443255582654684e-05, "loss": -0.0183, "num_tokens": 7584180.0, "reward": -0.42461490631103516, "reward_std": 0.3341720700263977, "rewards/rollout_reward_func/mean": -0.42461490631103516, "rewards/rollout_reward_func/std": 0.3409440219402313, "sampling/importance_sampling_ratio/max": 1.2403837442398071, "sampling/importance_sampling_ratio/mean": 0.9999593496322632, "sampling/importance_sampling_ratio/min": 0.7899606823921204, "sampling/sampling_logp_difference/max": 0.23577213287353516, "sampling/sampling_logp_difference/mean": 0.019728079438209534, "step": 231, "step_time": 23.07244206899486 }, { "clip_ratio/high_max": 0.025119478348642588, "clip_ratio/high_mean": 0.013918434968218207, "clip_ratio/low_mean": 0.019959259836468846, "clip_ratio/low_min": 0.004535147454589605, "clip_ratio/region_mean": 0.03387769451364875, "entropy": 0.5446676462888718, "epoch": 0.00232, "grad_norm": 0.5429065227508545, "kl": 1.2582595199346542, "learning_rate": 4.924550424982572e-05, "loss": -0.0298, "step": 232, "step_time": 5.346337181999843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 1379.375, "completions/mean_terminated_length": 1362.9583740234375, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "entropy": 0.7182583063840866, "epoch": 0.00233, "frac_reward_zero_std": 0.0, "grad_norm": 1.053868055343628, "kl": 1.2034199237823486, "learning_rate": 4.9047314488810387e-05, "loss": -0.3184, "num_tokens": 7653855.0, "reward": -0.408155620098114, "reward_std": 0.34933462738990784, "rewards/rollout_reward_func/mean": -0.408155620098114, "rewards/rollout_reward_func/std": 0.3592686355113983, "sampling/importance_sampling_ratio/max": 1.9485491514205933, "sampling/importance_sampling_ratio/mean": 1.0011026859283447, "sampling/importance_sampling_ratio/min": 0.6393722891807556, "sampling/sampling_logp_difference/max": 0.6670851111412048, "sampling/sampling_logp_difference/mean": 0.028645649552345276, "step": 233, "step_time": 24.891625911997835 }, { "clip_ratio/high_max": 0.04299504484515637, "clip_ratio/high_mean": 0.02823967597214505, "clip_ratio/low_mean": 0.012639836175367236, "clip_ratio/low_min": 0.0033887986792251468, "clip_ratio/region_mean": 0.04087951220571995, "entropy": 0.728181030601263, "epoch": 0.00234, "grad_norm": 0.9483682513237, "kl": 1.2175057232379913, "learning_rate": 4.884869595909802e-05, "loss": -0.3328, "step": 234, "step_time": 6.637629879001906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 1377.0625, "completions/mean_terminated_length": 1379.7857666015625, "completions/min_length": 1234.0, "completions/min_terminated_length": 1234.0, "entropy": 0.6216042265295982, "epoch": 0.00235, "frac_reward_zero_std": 0.0, "grad_norm": 1.1155496835708618, "kl": 1.3805908933281898, "learning_rate": 4.864965834107557e-05, "loss": 0.1285, "num_tokens": 7723433.0, "reward": -0.4682697355747223, "reward_std": 0.20931431651115417, "rewards/rollout_reward_func/mean": -0.4682697355747223, "rewards/rollout_reward_func/std": 0.21837398409843445, "sampling/importance_sampling_ratio/max": 1.7207974195480347, "sampling/importance_sampling_ratio/mean": 0.9958610534667969, "sampling/importance_sampling_ratio/min": 0.07614826411008835, "sampling/sampling_logp_difference/max": 2.575073003768921, "sampling/sampling_logp_difference/mean": 0.034567344933748245, "step": 235, "step_time": 23.16137045100004 }, { "clip_ratio/high_max": 0.033991452190093696, "clip_ratio/high_mean": 0.016995726095046848, "clip_ratio/low_mean": 0.014562130731064826, "clip_ratio/low_min": 0.0043223180109634995, "clip_ratio/region_mean": 0.03155785665148869, "entropy": 0.6139450222253799, "epoch": 0.00236, "grad_norm": 0.8477568030357361, "kl": 1.3915661424398422, "learning_rate": 4.845021133555572e-05, "loss": 0.1155, "step": 236, "step_time": 5.294515454996144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 1369.1875, "completions/mean_terminated_length": 1350.45458984375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.5727397054433823, "epoch": 0.00237, "frac_reward_zero_std": 0.0, "grad_norm": 0.9306647181510925, "kl": 1.2254540473222733, "learning_rate": 4.825036466330415e-05, "loss": -0.2639, "num_tokens": 7792807.0, "reward": -0.42224448919296265, "reward_std": 0.2746766209602356, "rewards/rollout_reward_func/mean": -0.42224448919296265, "rewards/rollout_reward_func/std": 0.2702217102050781, "sampling/importance_sampling_ratio/max": 1.405310034751892, "sampling/importance_sampling_ratio/mean": 1.000360369682312, "sampling/importance_sampling_ratio/min": 0.747802734375, "sampling/sampling_logp_difference/max": 0.3402578830718994, "sampling/sampling_logp_difference/mean": 0.02280454710125923, "step": 237, "step_time": 23.21407444700344 }, { "clip_ratio/high_max": 0.023022782639600337, "clip_ratio/high_mean": 0.01798958278959617, "clip_ratio/low_mean": 0.008867095166351646, "clip_ratio/low_min": 0.0026909965090453625, "clip_ratio/region_mean": 0.026856677839532495, "entropy": 0.5645197480916977, "epoch": 0.00238, "grad_norm": 0.6847227215766907, "kl": 1.223723091185093, "learning_rate": 4.805012806456572e-05, "loss": -0.2755, "step": 238, "step_time": 5.314819090001038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 1417.0625, "completions/mean_terminated_length": 1411.615478515625, "completions/min_length": 1314.0, "completions/min_terminated_length": 1314.0, "entropy": 0.6046967469155788, "epoch": 0.00239, "frac_reward_zero_std": 0.0, "grad_norm": 1.2144103050231934, "kl": 1.2632278501987457, "learning_rate": 4.784951129858976e-05, "loss": 0.1203, "num_tokens": 7863729.0, "reward": -0.3652339577674866, "reward_std": 0.2815146744251251, "rewards/rollout_reward_func/mean": -0.3652339577674866, "rewards/rollout_reward_func/std": 0.29634273052215576, "sampling/importance_sampling_ratio/max": 1.3012874126434326, "sampling/importance_sampling_ratio/mean": 0.9974876642227173, "sampling/importance_sampling_ratio/min": 0.5004737973213196, "sampling/sampling_logp_difference/max": 0.6922000646591187, "sampling/sampling_logp_difference/mean": 0.027356434613466263, "step": 239, "step_time": 25.902357285000107 }, { "clip_ratio/high_max": 0.025032905163243413, "clip_ratio/high_mean": 0.013338820892386138, "clip_ratio/low_mean": 0.016166088986210525, "clip_ratio/low_min": 0.010341217974200845, "clip_ratio/region_mean": 0.02950490964576602, "entropy": 0.5938820876181126, "epoch": 0.0024, "grad_norm": 0.9507948160171509, "kl": 1.2174823880195618, "learning_rate": 4.764852414315444e-05, "loss": 0.1009, "step": 240, "step_time": 5.3694163159980235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1840.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1697.0625, "completions/mean_terminated_length": 1689.71435546875, "completions/min_length": 1544.0, "completions/min_terminated_length": 1544.0, "entropy": 0.6850555390119553, "epoch": 0.00241, "frac_reward_zero_std": 0.0, "grad_norm": 1.2227540016174316, "kl": 1.0749512538313866, "learning_rate": 4.7447176394090105e-05, "loss": 0.3836, "num_tokens": 7943522.0, "reward": -0.34811559319496155, "reward_std": 0.26145875453948975, "rewards/rollout_reward_func/mean": -0.34811559319496155, "rewards/rollout_reward_func/std": 0.2614726722240448, "sampling/importance_sampling_ratio/max": 1.6382907629013062, "sampling/importance_sampling_ratio/mean": 1.0000560283660889, "sampling/importance_sampling_ratio/min": 0.45848941802978516, "sampling/sampling_logp_difference/max": 0.779818058013916, "sampling/sampling_logp_difference/mean": 0.028249643743038177, "step": 241, "step_time": 28.95033874699766 }, { "clip_ratio/high_max": 0.010906706214882433, "clip_ratio/high_mean": 0.006224958051461726, "clip_ratio/low_mean": 0.007575580908451229, "clip_ratio/low_min": 0.0016778523568063974, "clip_ratio/region_mean": 0.013800538814393803, "entropy": 0.6763835772871971, "epoch": 0.00242, "grad_norm": 1.0954372882843018, "kl": 1.0576452687382698, "learning_rate": 4.724547786480204e-05, "loss": 0.364, "step": 242, "step_time": 5.935986443002548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1881.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1696.59375, "completions/mean_terminated_length": 1655.4583740234375, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "entropy": 0.5860431045293808, "epoch": 0.00243, "frac_reward_zero_std": 0.0, "grad_norm": 1.199702501296997, "kl": 1.4102365896105766, "learning_rate": 4.7043438385792006e-05, "loss": -0.1597, "num_tokens": 8023376.0, "reward": -0.34889882802963257, "reward_std": 0.27960002422332764, "rewards/rollout_reward_func/mean": -0.34889882802963257, "rewards/rollout_reward_func/std": 0.27587223052978516, "sampling/importance_sampling_ratio/max": 1.6326786279678345, "sampling/importance_sampling_ratio/mean": 0.9994728565216064, "sampling/importance_sampling_ratio/min": 0.31338557600975037, "sampling/sampling_logp_difference/max": 1.1603209972381592, "sampling/sampling_logp_difference/mean": 0.030022799968719482, "step": 243, "step_time": 27.69197043299937 }, { "clip_ratio/high_max": 0.02898611791897565, "clip_ratio/high_mean": 0.01962472900049761, "clip_ratio/low_mean": 0.02425838867202401, "clip_ratio/low_min": 0.009923528297804296, "clip_ratio/region_mean": 0.04388311807997525, "entropy": 0.5802895128726959, "epoch": 0.00244, "grad_norm": 0.9481989741325378, "kl": 1.3910756707191467, "learning_rate": 4.684106780417915e-05, "loss": -0.1787, "step": 244, "step_time": 7.2959318599969265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1894.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1691.03125, "completions/mean_terminated_length": 1703.7999267578125, "completions/min_length": 1012.0, "completions/min_terminated_length": 1148.0, "entropy": 0.6190878264605999, "epoch": 0.00245, "frac_reward_zero_std": 0.0, "grad_norm": 1.1246025562286377, "kl": 1.0698552653193474, "learning_rate": 4.6638375983220126e-05, "loss": -0.3616, "num_tokens": 8102996.0, "reward": -0.31357449293136597, "reward_std": 0.3069915473461151, "rewards/rollout_reward_func/mean": -0.31357449293136597, "rewards/rollout_reward_func/std": 0.3354884386062622, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9998611211776733, "sampling/importance_sampling_ratio/min": 0.13376212120056152, "sampling/sampling_logp_difference/max": 2.199965238571167, "sampling/sampling_logp_difference/mean": 0.034452907741069794, "step": 245, "step_time": 28.37076109499867 }, { "clip_ratio/high_max": 0.019543186179362237, "clip_ratio/high_mean": 0.012175745039712638, "clip_ratio/low_mean": 0.010016661020927131, "clip_ratio/low_min": 0.001336898421868682, "clip_ratio/region_mean": 0.022192406468093395, "entropy": 0.6119689978659153, "epoch": 0.00246, "grad_norm": 0.9760249853134155, "kl": 1.1114588901400566, "learning_rate": 4.643537280182833e-05, "loss": -0.3749, "step": 246, "step_time": 6.104148196996903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 1720.53125, "completions/mean_terminated_length": 1717.2962646484375, "completions/min_length": 1338.0, "completions/min_terminated_length": 1338.0, "entropy": 0.5090821720659733, "epoch": 0.00247, "frac_reward_zero_std": 0.0, "grad_norm": 1.1742455959320068, "kl": 1.3592460826039314, "learning_rate": 4.6232068154092444e-05, "loss": 0.1438, "num_tokens": 8183603.0, "reward": -0.40345919132232666, "reward_std": 0.2762112319469452, "rewards/rollout_reward_func/mean": -0.40345919132232666, "rewards/rollout_reward_func/std": 0.27208802103996277, "sampling/importance_sampling_ratio/max": 2.093928337097168, "sampling/importance_sampling_ratio/mean": 0.9985787868499756, "sampling/importance_sampling_ratio/min": 0.2927846312522888, "sampling/sampling_logp_difference/max": 1.2283179759979248, "sampling/sampling_logp_difference/mean": 0.029695749282836914, "step": 247, "step_time": 29.182437593999566 }, { "clip_ratio/high_max": 0.021119080018252134, "clip_ratio/high_mean": 0.014775227755308151, "clip_ratio/low_mean": 0.010368433955591172, "clip_ratio/low_min": 0.001660590001847595, "clip_ratio/region_mean": 0.02514366141986102, "entropy": 0.5020915269851685, "epoch": 0.00248, "grad_norm": 0.9137567281723022, "kl": 1.311977319419384, "learning_rate": 4.6028471948794166e-05, "loss": 0.1268, "step": 248, "step_time": 6.024768344997938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1920.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1761.0625, "completions/mean_terminated_length": 1743.3043212890625, "completions/min_length": 1600.0, "completions/min_terminated_length": 1600.0, "entropy": 0.6087570525705814, "epoch": 0.00249, "frac_reward_zero_std": 0.0, "grad_norm": 1.2794415950775146, "kl": 1.193834237754345, "learning_rate": 4.582459410892531e-05, "loss": 0.1133, "num_tokens": 8265525.0, "reward": -0.3630402088165283, "reward_std": 0.16380134224891663, "rewards/rollout_reward_func/mean": -0.3630402088165283, "rewards/rollout_reward_func/std": 0.1817581206560135, "sampling/importance_sampling_ratio/max": 1.8391727209091187, "sampling/importance_sampling_ratio/mean": 1.0007237195968628, "sampling/importance_sampling_ratio/min": 0.5232561826705933, "sampling/sampling_logp_difference/max": 0.6476840972900391, "sampling/sampling_logp_difference/mean": 0.026726508513092995, "step": 249, "step_time": 30.742992465997304 }, { "clip_ratio/high_max": 0.018604300217702985, "clip_ratio/high_mean": 0.010120857506990433, "clip_ratio/low_mean": 0.01420485248672776, "clip_ratio/low_min": 0.006059854524210095, "clip_ratio/region_mean": 0.02432571043027565, "entropy": 0.6037475690245628, "epoch": 0.0025, "grad_norm": 0.9693906307220459, "kl": 1.1554812043905258, "learning_rate": 4.562044457120416e-05, "loss": 0.0995, "step": 250, "step_time": 6.18396285199924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 1728.1875, "completions/mean_terminated_length": 1732.1429443359375, "completions/min_length": 1379.0, "completions/min_terminated_length": 1581.0, "entropy": 0.5614854246377945, "epoch": 0.00251, "frac_reward_zero_std": 0.0, "grad_norm": 1.6389081478118896, "kl": 1.111307829618454, "learning_rate": 4.5416033285591175e-05, "loss": -0.0009, "num_tokens": 8346350.0, "reward": -0.39910563826560974, "reward_std": 0.2510056495666504, "rewards/rollout_reward_func/mean": -0.39910563826560974, "rewards/rollout_reward_func/std": 0.2463119924068451, "sampling/importance_sampling_ratio/max": 1.686388611793518, "sampling/importance_sampling_ratio/mean": 0.998680830001831, "sampling/importance_sampling_ratio/min": 0.49532800912857056, "sampling/sampling_logp_difference/max": 0.7025351524353027, "sampling/sampling_logp_difference/mean": 0.02717345580458641, "step": 251, "step_time": 30.680251233998206 }, { "clip_ratio/high_max": 0.019352586299646646, "clip_ratio/high_mean": 0.012517552357167006, "clip_ratio/low_mean": 0.010492240427993238, "clip_ratio/low_min": 0.005245084525085986, "clip_ratio/region_mean": 0.02300979197025299, "entropy": 0.5570769980549812, "epoch": 0.00252, "grad_norm": 1.3096987009048462, "kl": 1.0998517125844955, "learning_rate": 4.521137021480404e-05, "loss": -0.0208, "step": 252, "step_time": 6.076920389999941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1827.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 1677.9375, "completions/mean_terminated_length": 1665.571533203125, "completions/min_length": 1134.0, "completions/min_terminated_length": 1134.0, "entropy": 0.45975279435515404, "epoch": 0.00253, "frac_reward_zero_std": 0.0, "grad_norm": 1.1139249801635742, "kl": 1.3870538547635078, "learning_rate": 4.5006465333832084e-05, "loss": 0.4673, "num_tokens": 8425532.0, "reward": -0.2921108305454254, "reward_std": 0.33665600419044495, "rewards/rollout_reward_func/mean": -0.2921108305454254, "rewards/rollout_reward_func/std": 0.3631843626499176, "sampling/importance_sampling_ratio/max": 2.0037894248962402, "sampling/importance_sampling_ratio/mean": 1.002892017364502, "sampling/importance_sampling_ratio/min": 0.3306291699409485, "sampling/sampling_logp_difference/max": 1.1067578792572021, "sampling/sampling_logp_difference/mean": 0.0269552581012249, "step": 253, "step_time": 28.641130245001477 }, { "clip_ratio/high_max": 0.011471019824966788, "clip_ratio/high_mean": 0.005735509912483394, "clip_ratio/low_mean": 0.01236050232546404, "clip_ratio/low_min": 0.005589257925748825, "clip_ratio/region_mean": 0.018096011830493808, "entropy": 0.45960570871829987, "epoch": 0.00254, "grad_norm": 0.8957400321960449, "kl": 1.3566706702113152, "learning_rate": 4.4801328629450137e-05, "loss": 0.4514, "step": 254, "step_time": 6.77171369899952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 1785.53125, "completions/mean_terminated_length": 1786.10009765625, "completions/min_length": 1698.0, "completions/min_terminated_length": 1698.0, "entropy": 0.4569433890283108, "epoch": 0.00255, "frac_reward_zero_std": 0.0, "grad_norm": 1.3043508529663086, "kl": 1.2680969387292862, "learning_rate": 4.459597009973175e-05, "loss": 0.2394, "num_tokens": 8508349.0, "reward": -0.3875284790992737, "reward_std": 0.1898951381444931, "rewards/rollout_reward_func/mean": -0.3875284790992737, "rewards/rollout_reward_func/std": 0.19851893186569214, "sampling/importance_sampling_ratio/max": 1.4133907556533813, "sampling/importance_sampling_ratio/mean": 0.9994107484817505, "sampling/importance_sampling_ratio/min": 0.7249796986579895, "sampling/sampling_logp_difference/max": 0.3459916114807129, "sampling/sampling_logp_difference/mean": 0.022265754640102386, "step": 255, "step_time": 27.762585487997057 }, { "clip_ratio/high_max": 0.028818156104534864, "clip_ratio/high_mean": 0.018654855899512768, "clip_ratio/low_mean": 0.013128510676324368, "clip_ratio/low_min": 0.0044964030385017395, "clip_ratio/region_mean": 0.031783365906449035, "entropy": 0.45888680778443813, "epoch": 0.00256, "grad_norm": 0.9642385244369507, "kl": 1.2335847467184067, "learning_rate": 4.439039975356194e-05, "loss": 0.2257, "step": 256, "step_time": 6.0675096189988835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1884.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 1677.875, "completions/mean_terminated_length": 1678.4334716796875, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "entropy": 0.44360771775245667, "epoch": 0.00257, "frac_reward_zero_std": 0.0, "grad_norm": 1.289170503616333, "kl": 1.3471060693264008, "learning_rate": 4.418462761014939e-05, "loss": 0.1535, "num_tokens": 8587604.0, "reward": -0.3666492700576782, "reward_std": 0.2807188630104065, "rewards/rollout_reward_func/mean": -0.3666492700576782, "rewards/rollout_reward_func/std": 0.30381450057029724, "sampling/importance_sampling_ratio/max": 1.4430872201919556, "sampling/importance_sampling_ratio/mean": 1.0010987520217896, "sampling/importance_sampling_ratio/min": 0.6292256712913513, "sampling/sampling_logp_difference/max": 0.4632652997970581, "sampling/sampling_logp_difference/mean": 0.0242949016392231, "step": 257, "step_time": 27.15361070200197 }, { "clip_ratio/high_max": 0.03136689949315041, "clip_ratio/high_mean": 0.019189550366718322, "clip_ratio/low_mean": 0.015866847592405975, "clip_ratio/low_min": 0.005305602680891752, "clip_ratio/region_mean": 0.035056397784501314, "entropy": 0.43971412628889084, "epoch": 0.00258, "grad_norm": 0.8005442023277283, "kl": 1.3311180174350739, "learning_rate": 4.397866369853805e-05, "loss": 0.1375, "step": 258, "step_time": 6.565812303995699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 1712.375, "completions/mean_terminated_length": 1712.375, "completions/min_length": 1064.0, "completions/min_terminated_length": 1064.0, "entropy": 0.3554973527789116, "epoch": 0.00259, "frac_reward_zero_std": 0.0, "grad_norm": 0.9519293308258057, "kl": 1.5265100672841072, "learning_rate": 4.3772518057118404e-05, "loss": -0.1086, "num_tokens": 8668020.0, "reward": -0.38971930742263794, "reward_std": 0.31382888555526733, "rewards/rollout_reward_func/mean": -0.38971930742263794, "rewards/rollout_reward_func/std": 0.3226642608642578, "sampling/importance_sampling_ratio/max": 1.4233710765838623, "sampling/importance_sampling_ratio/mean": 1.0004098415374756, "sampling/importance_sampling_ratio/min": 0.17093463242053986, "sampling/sampling_logp_difference/max": 1.7664740085601807, "sampling/sampling_logp_difference/mean": 0.022783301770687103, "step": 259, "step_time": 27.792880233002506 }, { "clip_ratio/high_max": 0.02593483787495643, "clip_ratio/high_mean": 0.01678526250179857, "clip_ratio/low_mean": 0.015869285445660353, "clip_ratio/low_min": 0.004712301655672491, "clip_ratio/region_mean": 0.0326545478310436, "entropy": 0.3470265008509159, "epoch": 0.0026, "grad_norm": 0.6780015230178833, "kl": 1.6021624654531479, "learning_rate": 4.3566200733138176e-05, "loss": -0.1143, "step": 260, "step_time": 6.027849960999447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1707.65625, "completions/mean_terminated_length": 1704.300048828125, "completions/min_length": 1579.0, "completions/min_terminated_length": 1579.0, "entropy": 0.3321564346551895, "epoch": 0.00261, "frac_reward_zero_std": 0.0, "grad_norm": 0.8901036977767944, "kl": 1.3136900514364243, "learning_rate": 4.335972178221269e-05, "loss": -0.0091, "num_tokens": 8748213.0, "reward": -0.3292140066623688, "reward_std": 0.28540170192718506, "rewards/rollout_reward_func/mean": -0.3292140066623688, "rewards/rollout_reward_func/std": 0.2877996563911438, "sampling/importance_sampling_ratio/max": 1.2949029207229614, "sampling/importance_sampling_ratio/mean": 0.9994099140167236, "sampling/importance_sampling_ratio/min": 0.6516190767288208, "sampling/sampling_logp_difference/max": 0.4282951354980469, "sampling/sampling_logp_difference/mean": 0.017434781417250633, "step": 261, "step_time": 27.197426648002875 }, { "clip_ratio/high_max": 0.021629020106047392, "clip_ratio/high_mean": 0.013081360491923988, "clip_ratio/low_mean": 0.006478912808233872, "clip_ratio/low_min": 0.0029820722993463278, "clip_ratio/region_mean": 0.01956027330015786, "entropy": 0.33012816682457924, "epoch": 0.00262, "grad_norm": 0.6890857815742493, "kl": 1.3043452054262161, "learning_rate": 4.31530912678347e-05, "loss": -0.0203, "step": 262, "step_time": 5.984241919000851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1654.78125, "completions/mean_terminated_length": 1654.78125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "entropy": 0.37759771570563316, "epoch": 0.00263, "frac_reward_zero_std": 0.0, "grad_norm": 0.9726264476776123, "kl": 1.6636318117380142, "learning_rate": 4.2946319260883964e-05, "loss": -0.0824, "num_tokens": 8826708.0, "reward": -0.3082003593444824, "reward_std": 0.3507794439792633, "rewards/rollout_reward_func/mean": -0.3082003593444824, "rewards/rollout_reward_func/std": 0.3486320674419403, "sampling/importance_sampling_ratio/max": 1.3329782485961914, "sampling/importance_sampling_ratio/mean": 1.0000617504119873, "sampling/importance_sampling_ratio/min": 0.5601345896720886, "sampling/sampling_logp_difference/max": 0.579578161239624, "sampling/sampling_logp_difference/mean": 0.020423412322998047, "step": 263, "step_time": 27.634638796997024 }, { "clip_ratio/high_max": 0.026410604477860034, "clip_ratio/high_mean": 0.018214357958640903, "clip_ratio/low_mean": 0.01389897777698934, "clip_ratio/low_min": 0.008039031410589814, "clip_ratio/region_mean": 0.032113335793837905, "entropy": 0.3750726170837879, "epoch": 0.00264, "grad_norm": 0.7206187844276428, "kl": 1.611046925187111, "learning_rate": 4.273941583913639e-05, "loss": -0.0925, "step": 264, "step_time": 5.920993510002518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1686.3125, "completions/mean_terminated_length": 1686.3125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.3661015033721924, "epoch": 0.00265, "frac_reward_zero_std": 0.0, "grad_norm": 0.8947153091430664, "kl": 2.2019295543432236, "learning_rate": 4.253239108677285e-05, "loss": -0.0712, "num_tokens": 8906247.0, "reward": -0.48315995931625366, "reward_std": 0.25976985692977905, "rewards/rollout_reward_func/mean": -0.48315995931625366, "rewards/rollout_reward_func/std": 0.2541763186454773, "sampling/importance_sampling_ratio/max": 1.4508082866668701, "sampling/importance_sampling_ratio/mean": 1.0017142295837402, "sampling/importance_sampling_ratio/min": 0.326316237449646, "sampling/sampling_logp_difference/max": 1.1198883056640625, "sampling/sampling_logp_difference/mean": 0.019335944205522537, "step": 265, "step_time": 26.402459584001917 }, { "clip_ratio/high_max": 0.03338373429141939, "clip_ratio/high_mean": 0.019759851158596575, "clip_ratio/low_mean": 0.018153951270505786, "clip_ratio/low_min": 0.006470588152296841, "clip_ratio/region_mean": 0.037913802661933005, "entropy": 0.3617684543132782, "epoch": 0.00266, "grad_norm": 0.5852524638175964, "kl": 1.9142607226967812, "learning_rate": 4.232525509388772e-05, "loss": -0.0792, "step": 266, "step_time": 5.9868235850026394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 1652.75, "completions/mean_terminated_length": 1652.75, "completions/min_length": 1570.0, "completions/min_terminated_length": 1570.0, "entropy": 0.2530566677451134, "epoch": 0.00267, "frac_reward_zero_std": 0.0, "grad_norm": 0.8562265038490295, "kl": 1.2694031074643135, "learning_rate": 4.2118017955997074e-05, "loss": -0.0126, "num_tokens": 8984600.0, "reward": -0.29276055097579956, "reward_std": 0.2257564812898636, "rewards/rollout_reward_func/mean": -0.29276055097579956, "rewards/rollout_reward_func/std": 0.23103740811347961, "sampling/importance_sampling_ratio/max": 1.5209555625915527, "sampling/importance_sampling_ratio/mean": 1.0015647411346436, "sampling/importance_sampling_ratio/min": 0.7344807386398315, "sampling/sampling_logp_difference/max": 0.41933882236480713, "sampling/sampling_logp_difference/mean": 0.014971786178648472, "step": 267, "step_time": 28.31521664399952 }, { "clip_ratio/high_max": 0.03470594109967351, "clip_ratio/high_mean": 0.021014586789533496, "clip_ratio/low_mean": 0.013055364601314068, "clip_ratio/low_min": 0.00722862035036087, "clip_ratio/region_mean": 0.034069951507262886, "entropy": 0.25966743752360344, "epoch": 0.00268, "grad_norm": 0.567527711391449, "kl": 1.250559814274311, "learning_rate": 4.191068977354662e-05, "loss": -0.0219, "step": 268, "step_time": 6.570512480002435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 1714.1875, "completions/mean_terminated_length": 1710.322509765625, "completions/min_length": 1559.0, "completions/min_terminated_length": 1559.0, "entropy": 0.3354354053735733, "epoch": 0.00269, "frac_reward_zero_std": 0.0, "grad_norm": 1.1204144954681396, "kl": 1.3689760863780975, "learning_rate": 4.1703280651419493e-05, "loss": -0.0911, "num_tokens": 9064991.0, "reward": -0.34254971146583557, "reward_std": 0.2667918801307678, "rewards/rollout_reward_func/mean": -0.34254971146583557, "rewards/rollout_reward_func/std": 0.2693158984184265, "sampling/importance_sampling_ratio/max": 1.5921086072921753, "sampling/importance_sampling_ratio/mean": 1.0010147094726562, "sampling/importance_sampling_ratio/min": 0.6691294312477112, "sampling/sampling_logp_difference/max": 0.4650592803955078, "sampling/sampling_logp_difference/mean": 0.01826593279838562, "step": 269, "step_time": 27.066498756001238 }, { "clip_ratio/high_max": 0.02873426815494895, "clip_ratio/high_mean": 0.015654659655410796, "clip_ratio/low_mean": 0.01913989509921521, "clip_ratio/low_min": 0.004814814776182175, "clip_ratio/region_mean": 0.03479455586057156, "entropy": 0.34724972024559975, "epoch": 0.0027, "grad_norm": 0.7140161395072937, "kl": 1.373668484389782, "learning_rate": 4.149580069844368e-05, "loss": -0.1025, "step": 270, "step_time": 6.010965050998493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 1739.28125, "completions/mean_terminated_length": 1739.28125, "completions/min_length": 1630.0, "completions/min_terminated_length": 1630.0, "entropy": 0.32653932087123394, "epoch": 0.00271, "frac_reward_zero_std": 0.0, "grad_norm": 0.9645451903343201, "kl": 1.221728079020977, "learning_rate": 4.128826002689941e-05, "loss": 0.0375, "num_tokens": 9146274.0, "reward": -0.3229370415210724, "reward_std": 0.21175247430801392, "rewards/rollout_reward_func/mean": -0.3229370415210724, "rewards/rollout_reward_func/std": 0.21965768933296204, "sampling/importance_sampling_ratio/max": 1.4248602390289307, "sampling/importance_sampling_ratio/mean": 1.000783920288086, "sampling/importance_sampling_ratio/min": 0.6550017595291138, "sampling/sampling_logp_difference/max": 0.42311739921569824, "sampling/sampling_logp_difference/mean": 0.018349623307585716, "step": 271, "step_time": 27.265292395999495 }, { "clip_ratio/high_max": 0.025783154647797346, "clip_ratio/high_mean": 0.017376948380842805, "clip_ratio/low_mean": 0.011236148187890649, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.028613096103072166, "entropy": 0.33122148364782333, "epoch": 0.00272, "grad_norm": 0.6647641062736511, "kl": 1.2120093256235123, "learning_rate": 4.108066875202617e-05, "loss": 0.0335, "step": 272, "step_time": 6.528355276999719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1915.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 1692.96875, "completions/mean_terminated_length": 1693.064453125, "completions/min_length": 977.0, "completions/min_terminated_length": 977.0, "entropy": 0.27096978574991226, "epoch": 0.00273, "frac_reward_zero_std": 0.0, "grad_norm": 0.9355181455612183, "kl": 1.1173231899738312, "learning_rate": 4.087303699152987e-05, "loss": -0.1601, "num_tokens": 9226016.0, "reward": -0.3995485305786133, "reward_std": 0.2603399157524109, "rewards/rollout_reward_func/mean": -0.3995485305786133, "rewards/rollout_reward_func/std": 0.2633010745048523, "sampling/importance_sampling_ratio/max": 1.4012361764907837, "sampling/importance_sampling_ratio/mean": 0.9990127086639404, "sampling/importance_sampling_ratio/min": 0.7348133325576782, "sampling/sampling_logp_difference/max": 0.33735477924346924, "sampling/sampling_logp_difference/mean": 0.013222629204392433, "step": 273, "step_time": 27.357047223002155 }, { "clip_ratio/high_max": 0.028645486803725362, "clip_ratio/high_mean": 0.01542893797159195, "clip_ratio/low_mean": 0.006024184171110392, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021453122259117663, "entropy": 0.27417062781751156, "epoch": 0.00274, "grad_norm": 0.6330700516700745, "kl": 1.1309190168976784, "learning_rate": 4.066537486508957e-05, "loss": -0.1689, "step": 274, "step_time": 6.09497125100097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 1697.71875, "completions/mean_terminated_length": 1697.71875, "completions/min_length": 1600.0, "completions/min_terminated_length": 1600.0, "entropy": 0.401613038033247, "epoch": 0.00275, "frac_reward_zero_std": 0.0, "grad_norm": 1.023305058479309, "kl": 1.2632574066519737, "learning_rate": 4.045769249386435e-05, "loss": 0.0887, "num_tokens": 9305869.0, "reward": -0.28507786989212036, "reward_std": 0.21457329392433167, "rewards/rollout_reward_func/mean": -0.28507786989212036, "rewards/rollout_reward_func/std": 0.2277485579252243, "sampling/importance_sampling_ratio/max": 1.366645097732544, "sampling/importance_sampling_ratio/mean": 1.0005452632904053, "sampling/importance_sampling_ratio/min": 0.7387505173683167, "sampling/sampling_logp_difference/max": 0.3123588562011719, "sampling/sampling_logp_difference/mean": 0.01817021705210209, "step": 275, "step_time": 27.42404237899973 }, { "clip_ratio/high_max": 0.018194853328168392, "clip_ratio/high_mean": 0.012183162965811789, "clip_ratio/low_mean": 0.01755013212095946, "clip_ratio/low_min": 0.0052648503333330154, "clip_ratio/region_mean": 0.029733294853940606, "entropy": 0.40009279176592827, "epoch": 0.00276, "grad_norm": 0.8817785978317261, "kl": 1.2845247387886047, "learning_rate": 4.0249999999999996e-05, "loss": 0.0804, "step": 276, "step_time": 5.9381952450003155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 1729.71875, "completions/mean_terminated_length": 1729.71875, "completions/min_length": 1547.0, "completions/min_terminated_length": 1547.0, "entropy": 0.3658145181834698, "epoch": 0.00277, "frac_reward_zero_std": 0.0, "grad_norm": 0.856060266494751, "kl": 1.4524849355220795, "learning_rate": 4.004230750613564e-05, "loss": 0.0671, "num_tokens": 9386807.0, "reward": -0.29274478554725647, "reward_std": 0.2496490329504013, "rewards/rollout_reward_func/mean": -0.29274478554725647, "rewards/rollout_reward_func/std": 0.2612221837043762, "sampling/importance_sampling_ratio/max": 1.4300185441970825, "sampling/importance_sampling_ratio/mean": 0.9979414343833923, "sampling/importance_sampling_ratio/min": 0.745233416557312, "sampling/sampling_logp_difference/max": 0.3576873540878296, "sampling/sampling_logp_difference/mean": 0.018407320603728294, "step": 277, "step_time": 27.448430756998278 }, { "clip_ratio/high_max": 0.016709288815036416, "clip_ratio/high_mean": 0.012048516189679503, "clip_ratio/low_mean": 0.015901208273135126, "clip_ratio/low_min": 0.009036728064529598, "clip_ratio/region_mean": 0.02794972446281463, "entropy": 0.36612502858042717, "epoch": 0.00278, "grad_norm": 0.6600938439369202, "kl": 1.463659793138504, "learning_rate": 3.983462513491042e-05, "loss": 0.058, "step": 278, "step_time": 6.881747132998498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1894.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1740.5, "completions/mean_terminated_length": 1740.5, "completions/min_length": 1575.0, "completions/min_terminated_length": 1575.0, "entropy": 0.3992572519928217, "epoch": 0.00279, "frac_reward_zero_std": 0.0, "grad_norm": 0.8840353488922119, "kl": 1.4135135263204575, "learning_rate": 3.962696300847013e-05, "loss": -0.149, "num_tokens": 9468108.0, "reward": -0.32793790102005005, "reward_std": 0.2324901819229126, "rewards/rollout_reward_func/mean": -0.32793790102005005, "rewards/rollout_reward_func/std": 0.24271242320537567, "sampling/importance_sampling_ratio/max": 1.3032965660095215, "sampling/importance_sampling_ratio/mean": 0.9989101886749268, "sampling/importance_sampling_ratio/min": 0.6135318875312805, "sampling/sampling_logp_difference/max": 0.488523006439209, "sampling/sampling_logp_difference/mean": 0.021164804697036743, "step": 279, "step_time": 27.198307661003128 }, { "clip_ratio/high_max": 0.024722397793084383, "clip_ratio/high_mean": 0.012834683759137988, "clip_ratio/low_mean": 0.013479079469107091, "clip_ratio/low_min": 0.0038645146414637566, "clip_ratio/region_mean": 0.026313763111829758, "entropy": 0.40178852155804634, "epoch": 0.0028, "grad_norm": 0.6964399218559265, "kl": 1.436361089348793, "learning_rate": 3.9419331247973824e-05, "loss": -0.1584, "step": 280, "step_time": 6.037880153995502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 1693.875, "completions/mean_terminated_length": 1688.1334228515625, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "entropy": 0.3368613515049219, "epoch": 0.00281, "frac_reward_zero_std": 0.0, "grad_norm": 0.8920314311981201, "kl": 1.262581318616867, "learning_rate": 3.92117399731006e-05, "loss": -0.0255, "num_tokens": 9547870.0, "reward": -0.3265736401081085, "reward_std": 0.2521629333496094, "rewards/rollout_reward_func/mean": -0.3265736401081085, "rewards/rollout_reward_func/std": 0.28114575147628784, "sampling/importance_sampling_ratio/max": 1.4411096572875977, "sampling/importance_sampling_ratio/mean": 0.9987660050392151, "sampling/importance_sampling_ratio/min": 0.47627145051956177, "sampling/sampling_logp_difference/max": 0.7417672872543335, "sampling/sampling_logp_difference/mean": 0.015821367502212524, "step": 281, "step_time": 27.643516633997933 }, { "clip_ratio/high_max": 0.01276142883580178, "clip_ratio/high_mean": 0.009881684265565127, "clip_ratio/low_mean": 0.011586937471292913, "clip_ratio/low_min": 0.0021186440717428923, "clip_ratio/region_mean": 0.021468621911481023, "entropy": 0.33809466660022736, "epoch": 0.00282, "grad_norm": 0.6557103991508484, "kl": 1.2327122539281845, "learning_rate": 3.900419930155632e-05, "loss": -0.0368, "step": 282, "step_time": 6.0523431559977325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1714.625, "completions/mean_terminated_length": 1714.800048828125, "completions/min_length": 1560.0, "completions/min_terminated_length": 1560.0, "entropy": 0.4241502955555916, "epoch": 0.00283, "frac_reward_zero_std": 0.0, "grad_norm": 1.4503082036972046, "kl": 1.2610230892896652, "learning_rate": 3.8796719348580505e-05, "loss": 0.0002, "num_tokens": 9628270.0, "reward": -0.23765935003757477, "reward_std": 0.24567170441150665, "rewards/rollout_reward_func/mean": -0.23765935003757477, "rewards/rollout_reward_func/std": 0.2423768788576126, "sampling/importance_sampling_ratio/max": 1.4063000679016113, "sampling/importance_sampling_ratio/mean": 0.9992272853851318, "sampling/importance_sampling_ratio/min": 0.537064790725708, "sampling/sampling_logp_difference/max": 0.6216365098953247, "sampling/sampling_logp_difference/mean": 0.02089468017220497, "step": 283, "step_time": 28.210079381999094 }, { "clip_ratio/high_max": 0.020595053443685174, "clip_ratio/high_mean": 0.013529925723560154, "clip_ratio/low_mean": 0.01386361091863364, "clip_ratio/low_min": 0.0036616161232814193, "clip_ratio/region_mean": 0.027393536060117185, "entropy": 0.42226066440343857, "epoch": 0.00284, "grad_norm": 1.0542271137237549, "kl": 1.271847128868103, "learning_rate": 3.858931022645337e-05, "loss": -0.01, "step": 284, "step_time": 5.970747958999709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 1728.875, "completions/mean_terminated_length": 1724.8333740234375, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "entropy": 0.2731884568929672, "epoch": 0.00285, "frac_reward_zero_std": 0.0, "grad_norm": 0.921492874622345, "kl": 0.9931524246931076, "learning_rate": 3.8381982044002925e-05, "loss": 0.1576, "num_tokens": 9709181.0, "reward": -0.33811116218566895, "reward_std": 0.29418307542800903, "rewards/rollout_reward_func/mean": -0.33811116218566895, "rewards/rollout_reward_func/std": 0.3273296058177948, "sampling/importance_sampling_ratio/max": 1.280454158782959, "sampling/importance_sampling_ratio/mean": 0.9990848302841187, "sampling/importance_sampling_ratio/min": 0.6121774315834045, "sampling/sampling_logp_difference/max": 0.49073314666748047, "sampling/sampling_logp_difference/mean": 0.013834383338689804, "step": 285, "step_time": 26.82136673500281 }, { "clip_ratio/high_max": 0.025841656606644392, "clip_ratio/high_mean": 0.013967439765110612, "clip_ratio/low_mean": 0.005921122734434903, "clip_ratio/low_min": 0.0007911392604000866, "clip_ratio/region_mean": 0.019888562615960836, "entropy": 0.27296687848865986, "epoch": 0.00286, "grad_norm": 0.7738970518112183, "kl": 0.9535024762153625, "learning_rate": 3.817474490611227e-05, "loss": 0.1462, "step": 286, "step_time": 6.682930117000069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 1704.5, "completions/mean_terminated_length": 1700.064453125, "completions/min_length": 1545.0, "completions/min_terminated_length": 1545.0, "entropy": 0.32749482057988644, "epoch": 0.00287, "frac_reward_zero_std": 0.0, "grad_norm": 1.4101401567459106, "kl": 1.0584971904754639, "learning_rate": 3.7967608913227136e-05, "loss": 0.038, "num_tokens": 9789281.0, "reward": -0.332811564207077, "reward_std": 0.20267030596733093, "rewards/rollout_reward_func/mean": -0.332811564207077, "rewards/rollout_reward_func/std": 0.2194613218307495, "sampling/importance_sampling_ratio/max": 1.4552406072616577, "sampling/importance_sampling_ratio/mean": 1.0000221729278564, "sampling/importance_sampling_ratio/min": 0.45661747455596924, "sampling/sampling_logp_difference/max": 0.7839093208312988, "sampling/sampling_logp_difference/mean": 0.018965495750308037, "step": 287, "step_time": 28.089322325999092 }, { "clip_ratio/high_max": 0.016004809644073248, "clip_ratio/high_mean": 0.009625781560316682, "clip_ratio/low_mean": 0.008421280304901302, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01804706157417968, "entropy": 0.326109291985631, "epoch": 0.00288, "grad_norm": 0.8159499168395996, "kl": 1.1019763126969337, "learning_rate": 3.776058416086362e-05, "loss": 0.0305, "step": 288, "step_time": 5.995489212000393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1658.8125, "completions/mean_terminated_length": 1659.354736328125, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "entropy": 0.4703957512974739, "epoch": 0.00289, "frac_reward_zero_std": 0.0, "grad_norm": 5.088852405548096, "kl": 4.509137082844973, "learning_rate": 3.755368073911604e-05, "loss": -0.1305, "num_tokens": 9867853.0, "reward": -0.39782798290252686, "reward_std": 0.26398491859436035, "rewards/rollout_reward_func/mean": -0.39782798290252686, "rewards/rollout_reward_func/std": 0.28180718421936035, "sampling/importance_sampling_ratio/max": 2.576124906539917, "sampling/importance_sampling_ratio/mean": 0.9995816946029663, "sampling/importance_sampling_ratio/min": 0.007747190538793802, "sampling/sampling_logp_difference/max": 4.860424995422363, "sampling/sampling_logp_difference/mean": 0.023618537932634354, "step": 289, "step_time": 27.99678410300112 }, { "clip_ratio/high_max": 0.015644149854779243, "clip_ratio/high_mean": 0.009274220210500062, "clip_ratio/low_mean": 0.003477742022369057, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01275196229107678, "entropy": 0.47243097983300686, "epoch": 0.0029, "grad_norm": 1.7930700778961182, "kl": 2.6588029116392136, "learning_rate": 3.73469087321653e-05, "loss": -0.1587, "step": 290, "step_time": 6.183328474000518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 1723.4375, "completions/mean_terminated_length": 1721.064453125, "completions/min_length": 1462.0, "completions/min_terminated_length": 1462.0, "entropy": 0.401326060295105, "epoch": 0.00291, "frac_reward_zero_std": 0.0, "grad_norm": 0.9773975610733032, "kl": 1.2011983543634415, "learning_rate": 3.714027821778731e-05, "loss": 0.033, "num_tokens": 9948565.0, "reward": -0.30232518911361694, "reward_std": 0.24841032922267914, "rewards/rollout_reward_func/mean": -0.30232518911361694, "rewards/rollout_reward_func/std": 0.2533982992172241, "sampling/importance_sampling_ratio/max": 2.089590311050415, "sampling/importance_sampling_ratio/mean": 1.0010582208633423, "sampling/importance_sampling_ratio/min": 0.6063990592956543, "sampling/sampling_logp_difference/max": 0.7369680404663086, "sampling/sampling_logp_difference/mean": 0.01909644901752472, "step": 291, "step_time": 27.101977140004237 }, { "clip_ratio/high_max": 0.02591145725455135, "clip_ratio/high_mean": 0.015039061836432666, "clip_ratio/low_mean": 0.011034520342946053, "clip_ratio/low_min": 0.0015060240402817726, "clip_ratio/region_mean": 0.0260735823540017, "entropy": 0.4019428677856922, "epoch": 0.00292, "grad_norm": 0.7517898082733154, "kl": 1.2007444612681866, "learning_rate": 3.6933799266861815e-05, "loss": 0.0226, "step": 292, "step_time": 6.799451634002253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 1655.84375, "completions/mean_terminated_length": 1655.84375, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "entropy": 0.31978612393140793, "epoch": 0.00293, "frac_reward_zero_std": 0.0, "grad_norm": 0.8349912762641907, "kl": 1.2581717148423195, "learning_rate": 3.672748194288159e-05, "loss": 0.0204, "num_tokens": 10027060.0, "reward": -0.29644477367401123, "reward_std": 0.26572006940841675, "rewards/rollout_reward_func/mean": -0.29644477367401123, "rewards/rollout_reward_func/std": 0.2695634663105011, "sampling/importance_sampling_ratio/max": 1.3600651025772095, "sampling/importance_sampling_ratio/mean": 0.9987689256668091, "sampling/importance_sampling_ratio/min": 0.5180136561393738, "sampling/sampling_logp_difference/max": 0.6577537059783936, "sampling/sampling_logp_difference/mean": 0.01630581170320511, "step": 293, "step_time": 26.365434844998163 }, { "clip_ratio/high_max": 0.021274815197102726, "clip_ratio/high_mean": 0.014405200898181647, "clip_ratio/low_mean": 0.004584286129102111, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018989487027283758, "entropy": 0.314215786755085, "epoch": 0.00294, "grad_norm": 0.5438471436500549, "kl": 1.260175697505474, "learning_rate": 3.6521336301461944e-05, "loss": 0.0136, "step": 294, "step_time": 5.968144996000774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1599.1875, "completions/mean_terminated_length": 1598.4193115234375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.28865749202668667, "epoch": 0.00295, "frac_reward_zero_std": 0.0, "grad_norm": 1.04511296749115, "kl": 1.2816387712955475, "learning_rate": 3.63153723898506e-05, "loss": 0.014, "num_tokens": 10103693.0, "reward": -0.316047340631485, "reward_std": 0.30873924493789673, "rewards/rollout_reward_func/mean": -0.316047340631485, "rewards/rollout_reward_func/std": 0.3330863118171692, "sampling/importance_sampling_ratio/max": 1.3749682903289795, "sampling/importance_sampling_ratio/mean": 0.9997192621231079, "sampling/importance_sampling_ratio/min": 0.7635630369186401, "sampling/sampling_logp_difference/max": 0.31843066215515137, "sampling/sampling_logp_difference/mean": 0.01401902362704277, "step": 295, "step_time": 26.13506375200086 }, { "clip_ratio/high_max": 0.019994753878563643, "clip_ratio/high_mean": 0.011093868175521493, "clip_ratio/low_mean": 0.008725065214093775, "clip_ratio/low_min": 0.0036057692486792803, "clip_ratio/region_mean": 0.01981893344782293, "entropy": 0.28587992675602436, "epoch": 0.00296, "grad_norm": 0.7525402307510376, "kl": 1.2671080529689789, "learning_rate": 3.610960024643805e-05, "loss": 0.0038, "step": 296, "step_time": 6.294128641000498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 1637.71875, "completions/mean_terminated_length": 1637.71875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "entropy": 0.2632557302713394, "epoch": 0.00297, "frac_reward_zero_std": 0.0, "grad_norm": 0.9562009572982788, "kl": 1.3877170458436012, "learning_rate": 3.5904029900268256e-05, "loss": 0.1196, "num_tokens": 10181644.0, "reward": -0.3559083938598633, "reward_std": 0.30061519145965576, "rewards/rollout_reward_func/mean": -0.3559083938598633, "rewards/rollout_reward_func/std": 0.2977730631828308, "sampling/importance_sampling_ratio/max": 1.4149571657180786, "sampling/importance_sampling_ratio/mean": 1.0021570920944214, "sampling/importance_sampling_ratio/min": 0.6622886657714844, "sampling/sampling_logp_difference/max": 0.41205382347106934, "sampling/sampling_logp_difference/mean": 0.013487951830029488, "step": 297, "step_time": 27.507051936001517 }, { "clip_ratio/high_max": 0.026098711881786585, "clip_ratio/high_mean": 0.01388268917798996, "clip_ratio/low_mean": 0.016885705990716815, "clip_ratio/low_min": 0.00592948729172349, "clip_ratio/region_mean": 0.030768395052291453, "entropy": 0.2609469797462225, "epoch": 0.00298, "grad_norm": 0.5750073194503784, "kl": 1.4079841002821922, "learning_rate": 3.569867137054987e-05, "loss": 0.1123, "step": 298, "step_time": 5.972969669001031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 1735.09375, "completions/mean_terminated_length": 1735.09375, "completions/min_length": 1593.0, "completions/min_terminated_length": 1593.0, "entropy": 0.3265743777155876, "epoch": 0.00299, "frac_reward_zero_std": 0.0, "grad_norm": 0.895510733127594, "kl": 1.4302487671375275, "learning_rate": 3.5493534666167914e-05, "loss": 0.0859, "num_tokens": 10262773.0, "reward": -0.3340492844581604, "reward_std": 0.18683116137981415, "rewards/rollout_reward_func/mean": -0.3340492844581604, "rewards/rollout_reward_func/std": 0.19103723764419556, "sampling/importance_sampling_ratio/max": 1.364039421081543, "sampling/importance_sampling_ratio/mean": 1.0008740425109863, "sampling/importance_sampling_ratio/min": 0.431986004114151, "sampling/sampling_logp_difference/max": 0.8393620848655701, "sampling/sampling_logp_difference/mean": 0.01666804775595665, "step": 299, "step_time": 26.551734692999162 }, { "clip_ratio/high_max": 0.02376546198502183, "clip_ratio/high_mean": 0.012808657018467784, "clip_ratio/low_mean": 0.013111507578287274, "clip_ratio/low_min": 0.007722355774603784, "clip_ratio/region_mean": 0.02592016465496272, "entropy": 0.32265283912420273, "epoch": 0.003, "grad_norm": 0.6215419173240662, "kl": 1.405053123831749, "learning_rate": 3.5288629785195956e-05, "loss": 0.0814, "step": 300, "step_time": 6.046183698999812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 1653.09375, "completions/mean_terminated_length": 1653.09375, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "entropy": 0.255836620926857, "epoch": 0.00301, "frac_reward_zero_std": 0.0, "grad_norm": 0.5830058455467224, "kl": 1.1905398815870285, "learning_rate": 3.5083966714408816e-05, "loss": 0.0417, "num_tokens": 10341184.0, "reward": -0.3241254389286041, "reward_std": 0.3267325758934021, "rewards/rollout_reward_func/mean": -0.3241254389286041, "rewards/rollout_reward_func/std": 0.36921945214271545, "sampling/importance_sampling_ratio/max": 1.3117518424987793, "sampling/importance_sampling_ratio/mean": 0.9993005990982056, "sampling/importance_sampling_ratio/min": 0.7946507334709167, "sampling/sampling_logp_difference/max": 0.2713634967803955, "sampling/sampling_logp_difference/mean": 0.01328326016664505, "step": 301, "step_time": 26.65179376499873 }, { "clip_ratio/high_max": 0.01649590115994215, "clip_ratio/high_mean": 0.01096153596881777, "clip_ratio/low_mean": 0.017142816039267927, "clip_ratio/low_min": 0.0025510203558951616, "clip_ratio/region_mean": 0.028104351833462715, "entropy": 0.2549610808491707, "epoch": 0.00302, "grad_norm": 0.3964756429195404, "kl": 1.2078507021069527, "learning_rate": 3.487955542879584e-05, "loss": 0.0343, "step": 302, "step_time": 5.831767339997896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 1713.9375, "completions/mean_terminated_length": 1713.9375, "completions/min_length": 1569.0, "completions/min_terminated_length": 1569.0, "entropy": 0.2520501893013716, "epoch": 0.00303, "frac_reward_zero_std": 0.0, "grad_norm": 0.5240432620048523, "kl": 1.3572960495948792, "learning_rate": 3.467540589107469e-05, "loss": 0.0264, "num_tokens": 10421576.0, "reward": -0.31221023201942444, "reward_std": 0.2563130259513855, "rewards/rollout_reward_func/mean": -0.31221023201942444, "rewards/rollout_reward_func/std": 0.2738856375217438, "sampling/importance_sampling_ratio/max": 1.4016004800796509, "sampling/importance_sampling_ratio/mean": 0.9990121126174927, "sampling/importance_sampling_ratio/min": 0.5832192301750183, "sampling/sampling_logp_difference/max": 0.5391920804977417, "sampling/sampling_logp_difference/mean": 0.01355869323015213, "step": 303, "step_time": 26.62041721699825 }, { "clip_ratio/high_max": 0.015463394112884998, "clip_ratio/high_mean": 0.011773719685152173, "clip_ratio/low_mean": 0.01218449289444834, "clip_ratio/low_min": 0.005635764915496111, "clip_ratio/region_mean": 0.02395821246318519, "entropy": 0.2511089500039816, "epoch": 0.00304, "grad_norm": 0.47363463044166565, "kl": 1.4030948877334595, "learning_rate": 3.447152805120584e-05, "loss": 0.0228, "step": 304, "step_time": 5.989414374000262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1625.78125, "completions/mean_terminated_length": 1625.78125, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "entropy": 0.2748305927962065, "epoch": 0.00305, "frac_reward_zero_std": 0.0, "grad_norm": 0.7597345113754272, "kl": 1.5798831135034561, "learning_rate": 3.4267931845907555e-05, "loss": -0.0507, "num_tokens": 10499151.0, "reward": -0.25723791122436523, "reward_std": 0.33049023151397705, "rewards/rollout_reward_func/mean": -0.25723791122436523, "rewards/rollout_reward_func/std": 0.3500983417034149, "sampling/importance_sampling_ratio/max": 1.989798903465271, "sampling/importance_sampling_ratio/mean": 1.0004245042800903, "sampling/importance_sampling_ratio/min": 0.426661878824234, "sampling/sampling_logp_difference/max": 0.8517634272575378, "sampling/sampling_logp_difference/mean": 0.01608913764357567, "step": 305, "step_time": 25.832208670999535 }, { "clip_ratio/high_max": 0.021567969117313623, "clip_ratio/high_mean": 0.01301612751558423, "clip_ratio/low_mean": 0.009010696725454181, "clip_ratio/low_min": 0.0021551724057644606, "clip_ratio/region_mean": 0.022026823600754142, "entropy": 0.2775880992412567, "epoch": 0.00306, "grad_norm": 0.4670715928077698, "kl": 1.601256012916565, "learning_rate": 3.4064627198171665e-05, "loss": -0.0558, "step": 306, "step_time": 7.165394652000032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1917.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 1652.0, "completions/mean_terminated_length": 1652.0, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "entropy": 0.28847518004477024, "epoch": 0.00307, "frac_reward_zero_std": 0.0, "grad_norm": 0.8520530462265015, "kl": 1.6379465609788895, "learning_rate": 3.3861624016779865e-05, "loss": -0.0084, "num_tokens": 10577575.0, "reward": -0.36571916937828064, "reward_std": 0.28022339940071106, "rewards/rollout_reward_func/mean": -0.36571916937828064, "rewards/rollout_reward_func/std": 0.3098476529121399, "sampling/importance_sampling_ratio/max": 1.4401140213012695, "sampling/importance_sampling_ratio/mean": 0.9993970394134521, "sampling/importance_sampling_ratio/min": 0.5231577157974243, "sampling/sampling_logp_difference/max": 0.6478723287582397, "sampling/sampling_logp_difference/mean": 0.01716748997569084, "step": 307, "step_time": 26.66528392100372 }, { "clip_ratio/high_max": 0.02933300146833062, "clip_ratio/high_mean": 0.018800569116137922, "clip_ratio/low_mean": 0.008719248115085065, "clip_ratio/low_min": 0.002659574383869767, "clip_ratio/region_mean": 0.027519817231222987, "entropy": 0.2931448705494404, "epoch": 0.00308, "grad_norm": 0.570613443851471, "kl": 1.651346117258072, "learning_rate": 3.365893219582084e-05, "loss": -0.0135, "step": 308, "step_time": 6.156346969999504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1640.21875, "completions/mean_terminated_length": 1640.21875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.3087381776422262, "epoch": 0.00309, "frac_reward_zero_std": 0.0, "grad_norm": 0.8091050386428833, "kl": 1.289789840579033, "learning_rate": 3.3456561614207985e-05, "loss": -0.1749, "num_tokens": 10655613.0, "reward": -0.39627617597579956, "reward_std": 0.2599743604660034, "rewards/rollout_reward_func/mean": -0.39627617597579956, "rewards/rollout_reward_func/std": 0.28550201654434204, "sampling/importance_sampling_ratio/max": 2.4766736030578613, "sampling/importance_sampling_ratio/mean": 1.004058837890625, "sampling/importance_sampling_ratio/min": 0.6814326643943787, "sampling/sampling_logp_difference/max": 0.9069163799285889, "sampling/sampling_logp_difference/mean": 0.019001323729753494, "step": 309, "step_time": 26.245833776001746 }, { "clip_ratio/high_max": 0.020851543871685863, "clip_ratio/high_mean": 0.013660037540830672, "clip_ratio/low_mean": 0.002358490601181984, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016018528142012656, "entropy": 0.3085634112358093, "epoch": 0.0031, "grad_norm": 0.7752407789230347, "kl": 1.295120745897293, "learning_rate": 3.325452213519794e-05, "loss": -0.1829, "step": 310, "step_time": 5.937131855996995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 1642.46875, "completions/mean_terminated_length": 1642.46875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.38769714906811714, "epoch": 0.00311, "frac_reward_zero_std": 0.0, "grad_norm": 1.0822538137435913, "kl": 1.5524880588054657, "learning_rate": 3.305282360590988e-05, "loss": -0.067, "num_tokens": 10733735.0, "reward": -0.3342939615249634, "reward_std": 0.28436195850372314, "rewards/rollout_reward_func/mean": -0.3342939615249634, "rewards/rollout_reward_func/std": 0.3038986623287201, "sampling/importance_sampling_ratio/max": 1.362818717956543, "sampling/importance_sampling_ratio/mean": 0.9993179440498352, "sampling/importance_sampling_ratio/min": 0.3904021084308624, "sampling/sampling_logp_difference/max": 0.9405779838562012, "sampling/sampling_logp_difference/mean": 0.02130117081105709, "step": 311, "step_time": 27.647770136001782 }, { "clip_ratio/high_max": 0.016187283326871693, "clip_ratio/high_mean": 0.011553710559383035, "clip_ratio/low_mean": 0.015858522208873183, "clip_ratio/low_min": 0.012366774724796414, "clip_ratio/region_mean": 0.027412232477217913, "entropy": 0.38218119740486145, "epoch": 0.00312, "grad_norm": 0.6556536555290222, "kl": 1.5894036442041397, "learning_rate": 3.2851475856845574e-05, "loss": -0.0721, "step": 312, "step_time": 5.971561485997881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 1734.875, "completions/mean_terminated_length": 1734.875, "completions/min_length": 1592.0, "completions/min_terminated_length": 1592.0, "entropy": 0.34134603664278984, "epoch": 0.00313, "frac_reward_zero_std": 0.0, "grad_norm": 0.7925176024436951, "kl": 1.379950799047947, "learning_rate": 3.265048870141023e-05, "loss": -0.0374, "num_tokens": 10814846.0, "reward": -0.3326209783554077, "reward_std": 0.22150471806526184, "rewards/rollout_reward_func/mean": -0.3326209783554077, "rewards/rollout_reward_func/std": 0.22830010950565338, "sampling/importance_sampling_ratio/max": 2.7102572917938232, "sampling/importance_sampling_ratio/mean": 1.0019333362579346, "sampling/importance_sampling_ratio/min": 0.43068286776542664, "sampling/sampling_logp_difference/max": 0.9970436096191406, "sampling/sampling_logp_difference/mean": 0.01938091777265072, "step": 313, "step_time": 26.933980166000765 }, { "clip_ratio/high_max": 0.024007610976696014, "clip_ratio/high_mean": 0.014086562674492598, "clip_ratio/low_mean": 0.016809358377940953, "clip_ratio/low_min": 0.004464285913854837, "clip_ratio/region_mean": 0.030895921285264194, "entropy": 0.3383186161518097, "epoch": 0.00314, "grad_norm": 0.48439493775367737, "kl": 1.370563119649887, "learning_rate": 3.2449871935434276e-05, "loss": -0.0444, "step": 314, "step_time": 6.016575620004005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1710.8125, "completions/mean_terminated_length": 1710.8125, "completions/min_length": 1610.0, "completions/min_terminated_length": 1610.0, "entropy": 0.27158660627901554, "epoch": 0.00315, "frac_reward_zero_std": 0.0, "grad_norm": 0.9541974067687988, "kl": 1.2685663625597954, "learning_rate": 3.224963533669585e-05, "loss": 0.1188, "num_tokens": 10895142.0, "reward": -0.26208436489105225, "reward_std": 0.2567344307899475, "rewards/rollout_reward_func/mean": -0.26208436489105225, "rewards/rollout_reward_func/std": 0.2525034546852112, "sampling/importance_sampling_ratio/max": 1.3942749500274658, "sampling/importance_sampling_ratio/mean": 0.9996742606163025, "sampling/importance_sampling_ratio/min": 0.48265936970710754, "sampling/sampling_logp_difference/max": 0.7284440994262695, "sampling/sampling_logp_difference/mean": 0.01853971928358078, "step": 315, "step_time": 27.06007424800009 }, { "clip_ratio/high_max": 0.00920020672492683, "clip_ratio/high_mean": 0.004600103362463415, "clip_ratio/low_mean": 0.016561065684072673, "clip_ratio/low_min": 0.008542737690731883, "clip_ratio/region_mean": 0.021161168930120766, "entropy": 0.2683967351913452, "epoch": 0.00316, "grad_norm": 0.6890423893928528, "kl": 1.3117049634456635, "learning_rate": 3.204978866444427e-05, "loss": 0.1138, "step": 316, "step_time": 6.690224530000705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 1643.625, "completions/mean_terminated_length": 1643.625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.3429113067686558, "epoch": 0.00317, "frac_reward_zero_std": 0.0, "grad_norm": 1.0341111421585083, "kl": 1.276606522500515, "learning_rate": 3.185034165892442e-05, "loss": -0.0961, "num_tokens": 10973296.0, "reward": -0.34782904386520386, "reward_std": 0.28003746271133423, "rewards/rollout_reward_func/mean": -0.34782904386520386, "rewards/rollout_reward_func/std": 0.2775237560272217, "sampling/importance_sampling_ratio/max": 1.498800277709961, "sampling/importance_sampling_ratio/mean": 1.001167893409729, "sampling/importance_sampling_ratio/min": 0.6546292901039124, "sampling/sampling_logp_difference/max": 0.423686146736145, "sampling/sampling_logp_difference/mean": 0.01856786385178566, "step": 317, "step_time": 26.87595820000024 }, { "clip_ratio/high_max": 0.018704255111515522, "clip_ratio/high_mean": 0.01231854542857036, "clip_ratio/low_mean": 0.007076790789142251, "clip_ratio/low_min": 0.001179245300590992, "clip_ratio/region_mean": 0.019395335868466645, "entropy": 0.3396468721330166, "epoch": 0.00318, "grad_norm": 0.8495044708251953, "kl": 1.2902041748166084, "learning_rate": 3.165130404090196e-05, "loss": -0.1044, "step": 318, "step_time": 5.998267991002649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 1692.65625, "completions/mean_terminated_length": 1692.65625, "completions/min_length": 1124.0, "completions/min_terminated_length": 1124.0, "entropy": 0.3386183138936758, "epoch": 0.00319, "frac_reward_zero_std": 0.0, "grad_norm": 1.1155974864959717, "kl": 1.578216813504696, "learning_rate": 3.145268551118962e-05, "loss": 0.0773, "num_tokens": 11053003.0, "reward": -0.3649328947067261, "reward_std": 0.29602405428886414, "rewards/rollout_reward_func/mean": -0.3649328947067261, "rewards/rollout_reward_func/std": 0.2831602394580841, "sampling/importance_sampling_ratio/max": 1.5509647130966187, "sampling/importance_sampling_ratio/mean": 1.0031399726867676, "sampling/importance_sampling_ratio/min": 0.6934173107147217, "sampling/sampling_logp_difference/max": 0.4388771057128906, "sampling/sampling_logp_difference/mean": 0.0208877082914114, "step": 319, "step_time": 26.921891862995835 }, { "clip_ratio/high_max": 0.013015943230129778, "clip_ratio/high_mean": 0.008928902505431324, "clip_ratio/low_mean": 0.0125357203069143, "clip_ratio/low_min": 0.0038928264984861016, "clip_ratio/region_mean": 0.021464623045176268, "entropy": 0.33680661395192146, "epoch": 0.0032, "grad_norm": 0.7507835030555725, "kl": 1.6232310682535172, "learning_rate": 3.125449575017427e-05, "loss": 0.0709, "step": 320, "step_time": 7.32946445199741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 1687.53125, "completions/mean_terminated_length": 1687.53125, "completions/min_length": 1303.0, "completions/min_terminated_length": 1303.0, "entropy": 0.3108698297291994, "epoch": 0.00321, "frac_reward_zero_std": 0.0, "grad_norm": 0.9776124358177185, "kl": 1.5578304529190063, "learning_rate": 3.105674441734531e-05, "loss": 0.0107, "num_tokens": 11132552.0, "reward": -0.35697421431541443, "reward_std": 0.2883848547935486, "rewards/rollout_reward_func/mean": -0.35697421431541443, "rewards/rollout_reward_func/std": 0.2915443181991577, "sampling/importance_sampling_ratio/max": 1.4338070154190063, "sampling/importance_sampling_ratio/mean": 0.9985977411270142, "sampling/importance_sampling_ratio/min": 0.2068168967962265, "sampling/sampling_logp_difference/max": 1.5759214162826538, "sampling/sampling_logp_difference/mean": 0.01987595111131668, "step": 321, "step_time": 27.151746367997475 }, { "clip_ratio/high_max": 0.024654722306877375, "clip_ratio/high_mean": 0.015175403910689056, "clip_ratio/low_mean": 0.017317220917902887, "clip_ratio/low_min": 0.009938123519532382, "clip_ratio/region_mean": 0.03249262447934598, "entropy": 0.30716669745743275, "epoch": 0.00322, "grad_norm": 0.8920367360115051, "kl": 1.6015677899122238, "learning_rate": 3.08594411508237e-05, "loss": 0.0071, "step": 322, "step_time": 5.955208508998112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 1665.71875, "completions/mean_terminated_length": 1665.71875, "completions/min_length": 1564.0, "completions/min_terminated_length": 1564.0, "entropy": 0.40772537514567375, "epoch": 0.00323, "frac_reward_zero_std": 0.0, "grad_norm": 1.1897844076156616, "kl": 1.4434964507818222, "learning_rate": 3.06625955668923e-05, "loss": 0.0246, "num_tokens": 11211358.0, "reward": -0.22678738832473755, "reward_std": 0.29577308893203735, "rewards/rollout_reward_func/mean": -0.22678738832473755, "rewards/rollout_reward_func/std": 0.3215741813182831, "sampling/importance_sampling_ratio/max": 1.3528801202774048, "sampling/importance_sampling_ratio/mean": 0.9989253282546997, "sampling/importance_sampling_ratio/min": 0.5580706596374512, "sampling/sampling_logp_difference/max": 0.5832695960998535, "sampling/sampling_logp_difference/mean": 0.019522234797477722, "step": 323, "step_time": 26.80844596300085 }, { "clip_ratio/high_max": 0.010225921869277954, "clip_ratio/high_mean": 0.00661898497492075, "clip_ratio/low_mean": 0.01250165730016306, "clip_ratio/low_min": 0.004829074838198721, "clip_ratio/region_mean": 0.01912064221687615, "entropy": 0.40133029222488403, "epoch": 0.00324, "grad_norm": 0.8774245381355286, "kl": 1.4515839219093323, "learning_rate": 3.046621725952714e-05, "loss": 0.0154, "step": 324, "step_time": 5.901338416000726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1741.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 1666.875, "completions/mean_terminated_length": 1666.875, "completions/min_length": 1580.0, "completions/min_terminated_length": 1580.0, "entropy": 0.29187275283038616, "epoch": 0.00325, "frac_reward_zero_std": 0.0, "grad_norm": 1.299919605255127, "kl": 1.3579960465431213, "learning_rate": 3.0270315799929914e-05, "loss": -0.1142, "num_tokens": 11290153.0, "reward": -0.34980133175849915, "reward_std": 0.22945016622543335, "rewards/rollout_reward_func/mean": -0.34980133175849915, "rewards/rollout_reward_func/std": 0.2464682161808014, "sampling/importance_sampling_ratio/max": 1.4272664785385132, "sampling/importance_sampling_ratio/mean": 1.002694845199585, "sampling/importance_sampling_ratio/min": 0.670484185218811, "sampling/sampling_logp_difference/max": 0.39975517988204956, "sampling/sampling_logp_difference/mean": 0.01833367720246315, "step": 325, "step_time": 27.694059677996847 }, { "clip_ratio/high_max": 0.021095296135172248, "clip_ratio/high_mean": 0.0149164839531295, "clip_ratio/low_mean": 0.01602538232691586, "clip_ratio/low_min": 0.005463386652991176, "clip_ratio/region_mean": 0.030941865872591734, "entropy": 0.2900099717080593, "epoch": 0.00326, "grad_norm": 0.8346182703971863, "kl": 1.4127541407942772, "learning_rate": 3.0074900736061354e-05, "loss": -0.1236, "step": 326, "step_time": 5.754384321000543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1861.0, "completions/max_terminated_length": 1861.0, "completions/mean_length": 1699.09375, "completions/mean_terminated_length": 1699.09375, "completions/min_length": 1589.0, "completions/min_terminated_length": 1589.0, "entropy": 0.4950914513319731, "epoch": 0.00327, "frac_reward_zero_std": 0.0, "grad_norm": 0.9218499660491943, "kl": 1.1431479379534721, "learning_rate": 2.9879981592176e-05, "loss": 0.0229, "num_tokens": 11370047.0, "reward": -0.19750934839248657, "reward_std": 0.22417429089546204, "rewards/rollout_reward_func/mean": -0.19750934839248657, "rewards/rollout_reward_func/std": 0.2402225136756897, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 0.9998868703842163, "sampling/importance_sampling_ratio/min": 0.02070748247206211, "sampling/sampling_logp_difference/max": 3.877260208129883, "sampling/sampling_logp_difference/mean": 0.035609886050224304, "step": 327, "step_time": 27.092650211003274 }, { "clip_ratio/high_max": 0.02039982727728784, "clip_ratio/high_mean": 0.01019991363864392, "clip_ratio/low_mean": 0.01502365042688325, "clip_ratio/low_min": 0.0033984502078965306, "clip_ratio/region_mean": 0.025223563774488866, "entropy": 0.49197234865278006, "epoch": 0.00328, "grad_norm": 0.8338290452957153, "kl": 1.1596594825387, "learning_rate": 2.9685567868357922e-05, "loss": 0.0148, "step": 328, "step_time": 6.040699450997636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1708.78125, "completions/mean_terminated_length": 1708.78125, "completions/min_length": 1625.0, "completions/min_terminated_length": 1625.0, "entropy": 0.29792289063334465, "epoch": 0.00329, "frac_reward_zero_std": 0.0, "grad_norm": 1.0157338380813599, "kl": 1.4072334170341492, "learning_rate": 2.9491669040057753e-05, "loss": -0.0235, "num_tokens": 11450267.0, "reward": -0.31510427594184875, "reward_std": 0.24744383990764618, "rewards/rollout_reward_func/mean": -0.31510427594184875, "rewards/rollout_reward_func/std": 0.2439759522676468, "sampling/importance_sampling_ratio/max": 2.541884422302246, "sampling/importance_sampling_ratio/mean": 1.0008798837661743, "sampling/importance_sampling_ratio/min": 0.6031873822212219, "sampling/sampling_logp_difference/max": 0.9329056739807129, "sampling/sampling_logp_difference/mean": 0.020144585520029068, "step": 329, "step_time": 26.430551294997713 }, { "clip_ratio/high_max": 0.012918181018903852, "clip_ratio/high_mean": 0.008254443062469363, "clip_ratio/low_mean": 0.009505538793746382, "clip_ratio/low_min": 0.0030982252210378647, "clip_ratio/region_mean": 0.017759981739800423, "entropy": 0.29644286073744297, "epoch": 0.0033, "grad_norm": 0.908355712890625, "kl": 1.4785271883010864, "learning_rate": 2.9298294557630826e-05, "loss": -0.0288, "step": 330, "step_time": 7.165513138003007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1937.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 1735.21875, "completions/mean_terminated_length": 1735.21875, "completions/min_length": 1527.0, "completions/min_terminated_length": 1527.0, "entropy": 0.2993805855512619, "epoch": 0.00331, "frac_reward_zero_std": 0.0, "grad_norm": 1.1085600852966309, "kl": 1.483883611857891, "learning_rate": 2.9105453845876614e-05, "loss": 0.0295, "num_tokens": 11531357.0, "reward": -0.33016860485076904, "reward_std": 0.33701637387275696, "rewards/rollout_reward_func/mean": -0.33016860485076904, "rewards/rollout_reward_func/std": 0.32859018445014954, "sampling/importance_sampling_ratio/max": 1.3856598138809204, "sampling/importance_sampling_ratio/mean": 0.9990066289901733, "sampling/importance_sampling_ratio/min": 0.5948973894119263, "sampling/sampling_logp_difference/max": 0.5193663835525513, "sampling/sampling_logp_difference/mean": 0.020575789734721184, "step": 331, "step_time": 27.7153690239993 }, { "clip_ratio/high_max": 0.03454454580787569, "clip_ratio/high_mean": 0.02203672128962353, "clip_ratio/low_mean": 0.009176777850370854, "clip_ratio/low_min": 0.004119053948670626, "clip_ratio/region_mean": 0.031213499372825027, "entropy": 0.3034119922667742, "epoch": 0.00332, "grad_norm": 0.7328145503997803, "kl": 1.4621518477797508, "learning_rate": 2.8913156303579345e-05, "loss": 0.0196, "step": 332, "step_time": 6.213777661998392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1654.1875, "completions/mean_terminated_length": 1654.1875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.31864678859710693, "epoch": 0.00333, "frac_reward_zero_std": 0.0, "grad_norm": 0.7948474884033203, "kl": 1.3238542303442955, "learning_rate": 2.8721411303049936e-05, "loss": -0.1522, "num_tokens": 11609799.0, "reward": -0.36039847135543823, "reward_std": 0.26460525393486023, "rewards/rollout_reward_func/mean": -0.36039847135543823, "rewards/rollout_reward_func/std": 0.2726755738258362, "sampling/importance_sampling_ratio/max": 1.4634615182876587, "sampling/importance_sampling_ratio/mean": 1.0019690990447998, "sampling/importance_sampling_ratio/min": 0.7428172826766968, "sampling/sampling_logp_difference/max": 0.38080453872680664, "sampling/sampling_logp_difference/mean": 0.017515085637569427, "step": 333, "step_time": 26.226210351998816 }, { "clip_ratio/high_max": 0.015252797165885568, "clip_ratio/high_mean": 0.009183237445540726, "clip_ratio/low_mean": 0.017852610093541443, "clip_ratio/low_min": 0.008333036908879876, "clip_ratio/region_mean": 0.02703584684059024, "entropy": 0.3149984981864691, "epoch": 0.00334, "grad_norm": 0.6729979515075684, "kl": 1.3519223481416702, "learning_rate": 2.8530228189669203e-05, "loss": -0.1572, "step": 334, "step_time": 6.500495514002978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1902.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 1725.5625, "completions/mean_terminated_length": 1725.5625, "completions/min_length": 1333.0, "completions/min_terminated_length": 1333.0, "entropy": 0.26893332228064537, "epoch": 0.00335, "frac_reward_zero_std": 0.0, "grad_norm": 1.2063705921173096, "kl": 2.1284487694501877, "learning_rate": 2.8339616281432378e-05, "loss": -0.0799, "num_tokens": 11690615.0, "reward": -0.33747610449790955, "reward_std": 0.23020978271961212, "rewards/rollout_reward_func/mean": -0.33747610449790955, "rewards/rollout_reward_func/std": 0.2628439664840698, "sampling/importance_sampling_ratio/max": 1.3662012815475464, "sampling/importance_sampling_ratio/mean": 0.9992856979370117, "sampling/importance_sampling_ratio/min": 0.5883904695510864, "sampling/sampling_logp_difference/max": 0.5303645730018616, "sampling/sampling_logp_difference/mean": 0.0165717750787735, "step": 335, "step_time": 26.78048735199991 }, { "clip_ratio/high_max": 0.017331918817944825, "clip_ratio/high_mean": 0.009303714556153864, "clip_ratio/low_mean": 0.008379408915061504, "clip_ratio/low_min": 0.001623376621864736, "clip_ratio/region_mean": 0.017683123471215367, "entropy": 0.271977923810482, "epoch": 0.00336, "grad_norm": 0.8048015832901001, "kl": 1.8007766753435135, "learning_rate": 2.8149584868494948e-05, "loss": -0.0863, "step": 336, "step_time": 6.12497890800114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 1676.78125, "completions/mean_terminated_length": 1676.78125, "completions/min_length": 1184.0, "completions/min_terminated_length": 1184.0, "entropy": 0.23397336341440678, "epoch": 0.00337, "frac_reward_zero_std": 0.0, "grad_norm": 0.881834864616394, "kl": 1.4368260987102985, "learning_rate": 2.796014321271988e-05, "loss": 0.0193, "num_tokens": 11769792.0, "reward": -0.1745065301656723, "reward_std": 0.25410425662994385, "rewards/rollout_reward_func/mean": -0.1745065301656723, "rewards/rollout_reward_func/std": 0.28326255083084106, "sampling/importance_sampling_ratio/max": 1.7290990352630615, "sampling/importance_sampling_ratio/mean": 0.9988943338394165, "sampling/importance_sampling_ratio/min": 0.4615992605686188, "sampling/sampling_logp_difference/max": 0.7730581760406494, "sampling/sampling_logp_difference/mean": 0.01699381321668625, "step": 337, "step_time": 26.441225729002326 }, { "clip_ratio/high_max": 0.012726471060886979, "clip_ratio/high_mean": 0.007986612035892904, "clip_ratio/low_mean": 0.013868798443581909, "clip_ratio/low_min": 0.008099310100078583, "clip_ratio/region_mean": 0.021855410421267152, "entropy": 0.23370393924415112, "epoch": 0.00338, "grad_norm": 0.6997842192649841, "kl": 1.4461667761206627, "learning_rate": 2.7771300547226225e-05, "loss": 0.015, "step": 338, "step_time": 5.951883257999725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1896.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 1704.03125, "completions/mean_terminated_length": 1707.1934814453125, "completions/min_length": 1565.0, "completions/min_terminated_length": 1565.0, "entropy": 0.2623404338955879, "epoch": 0.00339, "frac_reward_zero_std": 0.0, "grad_norm": 1.3844835758209229, "kl": 1.523626759648323, "learning_rate": 2.7583066075939086e-05, "loss": 0.1158, "num_tokens": 11849858.0, "reward": -0.2860211730003357, "reward_std": 0.25058311223983765, "rewards/rollout_reward_func/mean": -0.2860211730003357, "rewards/rollout_reward_func/std": 0.2860009968280792, "sampling/importance_sampling_ratio/max": 2.5010273456573486, "sampling/importance_sampling_ratio/mean": 0.9996538162231445, "sampling/importance_sampling_ratio/min": 0.6194818019866943, "sampling/sampling_logp_difference/max": 0.9167015552520752, "sampling/sampling_logp_difference/mean": 0.01685354672372341, "step": 339, "step_time": 27.17739467300089 }, { "clip_ratio/high_max": 0.021978980395942926, "clip_ratio/high_mean": 0.010989490197971463, "clip_ratio/low_mean": 0.008551394334062934, "clip_ratio/low_min": 0.0018382353009656072, "clip_ratio/region_mean": 0.019540884415619075, "entropy": 0.26403761096298695, "epoch": 0.0034, "grad_norm": 0.6471749544143677, "kl": 1.4963716492056847, "learning_rate": 2.7395448973141018e-05, "loss": 0.1105, "step": 340, "step_time": 6.750552448000235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 1713.59375, "completions/mean_terminated_length": 1713.59375, "completions/min_length": 1607.0, "completions/min_terminated_length": 1607.0, "entropy": 0.15758358500897884, "epoch": 0.00341, "frac_reward_zero_std": 0.0, "grad_norm": 0.9701387882232666, "kl": 1.1048352047801018, "learning_rate": 2.720845838302494e-05, "loss": -0.0541, "num_tokens": 11930268.0, "reward": -0.3289961516857147, "reward_std": 0.23429100215435028, "rewards/rollout_reward_func/mean": -0.3289961516857147, "rewards/rollout_reward_func/std": 0.2324601113796234, "sampling/importance_sampling_ratio/max": 1.4431688785552979, "sampling/importance_sampling_ratio/mean": 1.0003907680511475, "sampling/importance_sampling_ratio/min": 0.007138380780816078, "sampling/sampling_logp_difference/max": 4.942269325256348, "sampling/sampling_logp_difference/mean": 0.0180658046156168, "step": 341, "step_time": 27.09933693399944 }, { "clip_ratio/high_max": 0.0072159008122980595, "clip_ratio/high_mean": 0.004360962426289916, "clip_ratio/low_mean": 0.005079821217805147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009440783644095063, "entropy": 0.15963860880583525, "epoch": 0.00342, "grad_norm": 0.5201312899589539, "kl": 1.0698599964380264, "learning_rate": 2.7022103419248412e-05, "loss": -0.0576, "step": 342, "step_time": 5.9745114849974925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 1664.40625, "completions/mean_terminated_length": 1664.40625, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "entropy": 0.1991294613108039, "epoch": 0.00343, "frac_reward_zero_std": 0.0, "grad_norm": 0.8366519808769226, "kl": 1.1888859868049622, "learning_rate": 2.6836393164489438e-05, "loss": -0.05, "num_tokens": 12009056.0, "reward": -0.30244266986846924, "reward_std": 0.21356219053268433, "rewards/rollout_reward_func/mean": -0.30244266986846924, "rewards/rollout_reward_func/std": 0.2198965698480606, "sampling/importance_sampling_ratio/max": 2.086890935897827, "sampling/importance_sampling_ratio/mean": 1.00006103515625, "sampling/importance_sampling_ratio/min": 0.5276151299476624, "sampling/sampling_logp_difference/max": 0.7356753349304199, "sampling/sampling_logp_difference/mean": 0.015745539218187332, "step": 343, "step_time": 26.57027862799987 }, { "clip_ratio/high_max": 0.016310590552166104, "clip_ratio/high_mean": 0.009643390541896224, "clip_ratio/low_mean": 0.004326542606577277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013969933148473501, "entropy": 0.19834863115102053, "epoch": 0.00344, "grad_norm": 0.6736382842063904, "kl": 1.1630604416131973, "learning_rate": 2.6651336670003853e-05, "loss": -0.0516, "step": 344, "step_time": 6.332988802998443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 1751.59375, "completions/mean_terminated_length": 1751.59375, "completions/min_length": 1606.0, "completions/min_terminated_length": 1606.0, "entropy": 0.2567560439929366, "epoch": 0.00345, "frac_reward_zero_std": 0.0, "grad_norm": 0.9481948614120483, "kl": 1.0666305609047413, "learning_rate": 2.6466942955184114e-05, "loss": 0.048, "num_tokens": 12090720.0, "reward": -0.3440430760383606, "reward_std": 0.22682608664035797, "rewards/rollout_reward_func/mean": -0.3440430760383606, "rewards/rollout_reward_func/std": 0.24248455464839935, "sampling/importance_sampling_ratio/max": 1.5949938297271729, "sampling/importance_sampling_ratio/mean": 0.9964559078216553, "sampling/importance_sampling_ratio/min": 0.2537042200565338, "sampling/sampling_logp_difference/max": 1.3715860843658447, "sampling/sampling_logp_difference/mean": 0.020231425762176514, "step": 345, "step_time": 27.470160887996826 }, { "clip_ratio/high_max": 0.0155776331666857, "clip_ratio/high_mean": 0.008499043993651867, "clip_ratio/low_mean": 0.005448155337944627, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013947199215181172, "entropy": 0.2604408422484994, "epoch": 0.00346, "grad_norm": 0.758798360824585, "kl": 1.06202382594347, "learning_rate": 2.6283221007119753e-05, "loss": 0.0412, "step": 346, "step_time": 6.113599244999932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1918.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 1729.21875, "completions/mean_terminated_length": 1729.21875, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "entropy": 0.2577835116535425, "epoch": 0.00347, "frac_reward_zero_std": 0.0, "grad_norm": 1.0687464475631714, "kl": 1.3719854429364204, "learning_rate": 2.6100179780159304e-05, "loss": -0.0701, "num_tokens": 12171676.0, "reward": -0.34831663966178894, "reward_std": 0.25279873609542847, "rewards/rollout_reward_func/mean": -0.34831663966178894, "rewards/rollout_reward_func/std": 0.2683839797973633, "sampling/importance_sampling_ratio/max": 1.9168801307678223, "sampling/importance_sampling_ratio/mean": 1.0017318725585938, "sampling/importance_sampling_ratio/min": 0.6751007437705994, "sampling/sampling_logp_difference/max": 0.6506989002227783, "sampling/sampling_logp_difference/mean": 0.016634851694107056, "step": 347, "step_time": 26.737927673999366 }, { "clip_ratio/high_max": 0.02911564928945154, "clip_ratio/high_mean": 0.017649755987804383, "clip_ratio/low_mean": 0.014982548134867102, "clip_ratio/low_min": 0.0036764706019312143, "clip_ratio/region_mean": 0.032632303540594876, "entropy": 0.2622975129634142, "epoch": 0.00348, "grad_norm": 0.720947802066803, "kl": 1.3556138575077057, "learning_rate": 2.591782819547396e-05, "loss": -0.0781, "step": 348, "step_time": 6.0985770849983965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1670.8125, "completions/mean_terminated_length": 1674.322509765625, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "entropy": 0.3010242059826851, "epoch": 0.00349, "frac_reward_zero_std": 0.0, "grad_norm": 0.7387946844100952, "kl": 1.2518930435180664, "learning_rate": 2.5736175140622693e-05, "loss": -0.0789, "num_tokens": 12250703.0, "reward": -0.2906644344329834, "reward_std": 0.3423389792442322, "rewards/rollout_reward_func/mean": -0.2906644344329834, "rewards/rollout_reward_func/std": 0.36451083421707153, "sampling/importance_sampling_ratio/max": 1.7547730207443237, "sampling/importance_sampling_ratio/mean": 0.9989039897918701, "sampling/importance_sampling_ratio/min": 0.6408941745758057, "sampling/sampling_logp_difference/max": 0.5623395442962646, "sampling/sampling_logp_difference/mean": 0.015134643763303757, "step": 349, "step_time": 27.62179246699816 }, { "clip_ratio/high_max": 0.00930511811748147, "clip_ratio/high_mean": 0.005497153615579009, "clip_ratio/low_mean": 0.014058202330488712, "clip_ratio/low_min": 0.008585532661527395, "clip_ratio/region_mean": 0.01955535588786006, "entropy": 0.30348924174904823, "epoch": 0.0035, "grad_norm": 0.5685436129570007, "kl": 1.2226365208625793, "learning_rate": 2.5555229469119133e-05, "loss": -0.0856, "step": 350, "step_time": 5.979286975998548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1666.875, "completions/mean_terminated_length": 1666.875, "completions/min_length": 1562.0, "completions/min_terminated_length": 1562.0, "entropy": 0.1848358828574419, "epoch": 0.00351, "frac_reward_zero_std": 0.0, "grad_norm": 0.7052991390228271, "kl": 1.079721249639988, "learning_rate": 2.5375e-05, "loss": 0.011, "num_tokens": 12329558.0, "reward": -0.22468164563179016, "reward_std": 0.2239469289779663, "rewards/rollout_reward_func/mean": -0.22468164563179016, "rewards/rollout_reward_func/std": 0.24883177876472473, "sampling/importance_sampling_ratio/max": 1.5240743160247803, "sampling/importance_sampling_ratio/mean": 0.9987475872039795, "sampling/importance_sampling_ratio/min": 0.6145860552787781, "sampling/sampling_logp_difference/max": 0.48680639266967773, "sampling/sampling_logp_difference/mean": 0.010760078206658363, "step": 351, "step_time": 26.22100326400141 }, { "clip_ratio/high_max": 0.013849671697244048, "clip_ratio/high_mean": 0.009008169290609658, "clip_ratio/low_mean": 0.00872205087216571, "clip_ratio/low_min": 0.004716981202363968, "clip_ratio/region_mean": 0.017730220104567707, "entropy": 0.18952921591699123, "epoch": 0.00352, "grad_norm": 0.4822404682636261, "kl": 1.069474183022976, "learning_rate": 2.5195495517395377e-05, "loss": 0.0078, "step": 352, "step_time": 5.816125348999776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 1598.90625, "completions/mean_terminated_length": 1598.90625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.26915660314261913, "epoch": 0.00353, "frac_reward_zero_std": 0.0, "grad_norm": 0.772146999835968, "kl": 0.9939440004527569, "learning_rate": 2.5016724770100502e-05, "loss": -0.1988, "num_tokens": 12406231.0, "reward": -0.26368623971939087, "reward_std": 0.2857432961463928, "rewards/rollout_reward_func/mean": -0.26368623971939087, "rewards/rollout_reward_func/std": 0.3011232316493988, "sampling/importance_sampling_ratio/max": 1.4750020503997803, "sampling/importance_sampling_ratio/mean": 1.0004088878631592, "sampling/importance_sampling_ratio/min": 0.7484446167945862, "sampling/sampling_logp_difference/max": 0.38865935802459717, "sampling/sampling_logp_difference/mean": 0.015578299760818481, "step": 353, "step_time": 26.09201659500286 }, { "clip_ratio/high_max": 0.028282994055189192, "clip_ratio/high_mean": 0.014974830497521907, "clip_ratio/low_mean": 0.008509852224960923, "clip_ratio/low_min": 0.0016891892300918698, "clip_ratio/region_mean": 0.023484682256821543, "entropy": 0.2719515264034271, "epoch": 0.00354, "grad_norm": 0.5021503567695618, "kl": 0.9869056902825832, "learning_rate": 2.4838696471149377e-05, "loss": -0.2058, "step": 354, "step_time": 6.573818677998133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 1686.03125, "completions/mean_terminated_length": 1686.03125, "completions/min_length": 1582.0, "completions/min_terminated_length": 1582.0, "entropy": 0.2146351533010602, "epoch": 0.00355, "frac_reward_zero_std": 0.0, "grad_norm": 0.9882098436355591, "kl": 0.7952211499214172, "learning_rate": 2.466141929739015e-05, "loss": 0.111, "num_tokens": 12485723.0, "reward": -0.25352942943573, "reward_std": 0.1724730283021927, "rewards/rollout_reward_func/mean": -0.25352942943573, "rewards/rollout_reward_func/std": 0.19391050934791565, "sampling/importance_sampling_ratio/max": 1.725264072418213, "sampling/importance_sampling_ratio/mean": 0.9988421201705933, "sampling/importance_sampling_ratio/min": 0.3576260507106781, "sampling/sampling_logp_difference/max": 1.0282673835754395, "sampling/sampling_logp_difference/mean": 0.01605220139026642, "step": 355, "step_time": 26.476247668004362 }, { "clip_ratio/high_max": 0.017918276600539684, "clip_ratio/high_mean": 0.009920676820911467, "clip_ratio/low_mean": 0.010778589465189725, "clip_ratio/low_min": 0.0013157895300537348, "clip_ratio/region_mean": 0.020699266577139497, "entropy": 0.21481928694993258, "epoch": 0.00356, "grad_norm": 0.6238808631896973, "kl": 0.8028969801962376, "learning_rate": 2.4484901889062156e-05, "loss": 0.1054, "step": 356, "step_time": 5.828814825999871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1673.0625, "completions/mean_terminated_length": 1673.0625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.23141017369925976, "epoch": 0.00357, "frac_reward_zero_std": 0.0, "grad_norm": 0.7177031636238098, "kl": 0.73196891695261, "learning_rate": 2.430915284937484e-05, "loss": 0.0014, "num_tokens": 12564810.0, "reward": -0.3322100043296814, "reward_std": 0.28593695163726807, "rewards/rollout_reward_func/mean": -0.3322100043296814, "rewards/rollout_reward_func/std": 0.2867739498615265, "sampling/importance_sampling_ratio/max": 1.8287023305892944, "sampling/importance_sampling_ratio/mean": 0.9997081756591797, "sampling/importance_sampling_ratio/min": 0.6611297130584717, "sampling/sampling_logp_difference/max": 0.6036065816879272, "sampling/sampling_logp_difference/mean": 0.01489335112273693, "step": 357, "step_time": 26.050706370997432 }, { "clip_ratio/high_max": 0.006536889239214361, "clip_ratio/high_mean": 0.003962889139074832, "clip_ratio/low_mean": 0.004087584791705012, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008050473930779845, "entropy": 0.23025601729750633, "epoch": 0.00358, "grad_norm": 0.5855690836906433, "kl": 0.7360790967941284, "learning_rate": 2.413418074408849e-05, "loss": -0.0016, "step": 358, "step_time": 6.44407608900292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 1703.25, "completions/mean_terminated_length": 1703.25, "completions/min_length": 1619.0, "completions/min_terminated_length": 1619.0, "entropy": 0.3312019780278206, "epoch": 0.00359, "frac_reward_zero_std": 0.0, "grad_norm": 1.1608937978744507, "kl": 0.9618125036358833, "learning_rate": 2.395999410109663e-05, "loss": -0.1778, "num_tokens": 12644845.0, "reward": -0.28249186277389526, "reward_std": 0.22331927716732025, "rewards/rollout_reward_func/mean": -0.28249186277389526, "rewards/rollout_reward_func/std": 0.2303655743598938, "sampling/importance_sampling_ratio/max": 1.409551739692688, "sampling/importance_sampling_ratio/mean": 0.9996355772018433, "sampling/importance_sampling_ratio/min": 0.5898933410644531, "sampling/sampling_logp_difference/max": 0.5278135538101196, "sampling/sampling_logp_difference/mean": 0.018575627356767654, "step": 359, "step_time": 27.478257644001133 }, { "clip_ratio/high_max": 0.007761538028717041, "clip_ratio/high_mean": 0.004865020979195833, "clip_ratio/low_mean": 0.010712303454056382, "clip_ratio/low_min": 0.0029761905316263437, "clip_ratio/region_mean": 0.015577324316836894, "entropy": 0.3299320787191391, "epoch": 0.0036, "grad_norm": 0.7858768701553345, "kl": 0.9826586246490479, "learning_rate": 2.3786601410010516e-05, "loss": -0.1837, "step": 360, "step_time": 5.884895023002173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2154.0, "completions/max_terminated_length": 2154.0, "completions/mean_length": 1961.375, "completions/mean_terminated_length": 1961.258056640625, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.24011485278606415, "epoch": 0.00361, "frac_reward_zero_std": 0.0, "grad_norm": 0.7515261769294739, "kl": 0.879942812025547, "learning_rate": 2.3614011121745284e-05, "loss": 0.0257, "num_tokens": 12733186.0, "reward": -0.3608664274215698, "reward_std": 0.174894779920578, "rewards/rollout_reward_func/mean": -0.3608664274215698, "rewards/rollout_reward_func/std": 0.17625097930431366, "sampling/importance_sampling_ratio/max": 1.4044677019119263, "sampling/importance_sampling_ratio/mean": 0.9992481470108032, "sampling/importance_sampling_ratio/min": 0.6532020568847656, "sampling/sampling_logp_difference/max": 0.42586880922317505, "sampling/sampling_logp_difference/mean": 0.01370285265147686, "step": 361, "step_time": 28.21206591599548 }, { "clip_ratio/high_max": 0.017405855818651617, "clip_ratio/high_mean": 0.0091339623904787, "clip_ratio/low_mean": 0.00593476538779214, "clip_ratio/low_min": 0.0013297871919348836, "clip_ratio/region_mean": 0.01506872772006318, "entropy": 0.24226810224354267, "epoch": 0.00362, "grad_norm": 0.5461111664772034, "kl": 0.8719981126487255, "learning_rate": 2.34422316481081e-05, "loss": 0.0192, "step": 362, "step_time": 6.704242689000239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2095.0, "completions/max_terminated_length": 2095.0, "completions/mean_length": 1877.28125, "completions/mean_terminated_length": 1869.7000732421875, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "entropy": 0.23397378344088793, "epoch": 0.00363, "frac_reward_zero_std": 0.0, "grad_norm": 1.0860110521316528, "kl": 0.7182975336909294, "learning_rate": 2.3271271361388158e-05, "loss": -0.0156, "num_tokens": 12818754.0, "reward": -0.3104773759841919, "reward_std": 0.2575171887874603, "rewards/rollout_reward_func/mean": -0.3104773759841919, "rewards/rollout_reward_func/std": 0.2803461253643036, "sampling/importance_sampling_ratio/max": 1.5360424518585205, "sampling/importance_sampling_ratio/mean": 1.0005502700805664, "sampling/importance_sampling_ratio/min": 0.5796602368354797, "sampling/sampling_logp_difference/max": 0.5453131198883057, "sampling/sampling_logp_difference/mean": 0.013212243095040321, "step": 363, "step_time": 29.268875911002397 }, { "clip_ratio/high_max": 0.02128656581044197, "clip_ratio/high_mean": 0.0117213778430596, "clip_ratio/low_mean": 0.005294472008245066, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017015849938616157, "entropy": 0.23606454115360975, "epoch": 0.00364, "grad_norm": 0.6945830583572388, "kl": 0.6941464021801949, "learning_rate": 2.3101138593948653e-05, "loss": -0.0219, "step": 364, "step_time": 7.289320243000475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2217.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 1989.28125, "completions/mean_terminated_length": 1988.9031982421875, "completions/min_length": 1454.0, "completions/min_terminated_length": 1454.0, "entropy": 0.16526137571781874, "epoch": 0.00365, "frac_reward_zero_std": 0.0, "grad_norm": 0.577035665512085, "kl": 0.5813510529696941, "learning_rate": 2.293184163782064e-05, "loss": -0.0287, "num_tokens": 12908008.0, "reward": -0.23070679605007172, "reward_std": 0.29629752039909363, "rewards/rollout_reward_func/mean": -0.23070679605007172, "rewards/rollout_reward_func/std": 0.33452990651130676, "sampling/importance_sampling_ratio/max": 1.5021698474884033, "sampling/importance_sampling_ratio/mean": 0.999884307384491, "sampling/importance_sampling_ratio/min": 0.7338963150978088, "sampling/sampling_logp_difference/max": 0.40691065788269043, "sampling/sampling_logp_difference/mean": 0.010209882631897926, "step": 365, "step_time": 28.546626266001113 }, { "clip_ratio/high_max": 0.017107546678744256, "clip_ratio/high_mean": 0.008553773339372128, "clip_ratio/low_mean": 0.003989361575804651, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0125431350315921, "entropy": 0.16368697676807642, "epoch": 0.00366, "grad_norm": 0.3467119634151459, "kl": 0.5774626731872559, "learning_rate": 2.2763388744298923e-05, "loss": -0.0315, "step": 366, "step_time": 6.761899004997758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2155.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 1887.6875, "completions/mean_terminated_length": 1880.838623046875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.19971368089318275, "epoch": 0.00367, "frac_reward_zero_std": 0.0, "grad_norm": 0.6819877624511719, "kl": 0.7875267565250397, "learning_rate": 2.2595788123539883e-05, "loss": 0.0067, "num_tokens": 12993977.0, "reward": -0.27859097719192505, "reward_std": 0.3273759186267853, "rewards/rollout_reward_func/mean": -0.27859097719192505, "rewards/rollout_reward_func/std": 0.3310786485671997, "sampling/importance_sampling_ratio/max": 1.2755916118621826, "sampling/importance_sampling_ratio/mean": 1.0011935234069824, "sampling/importance_sampling_ratio/min": 0.6515567898750305, "sampling/sampling_logp_difference/max": 0.4283907413482666, "sampling/sampling_logp_difference/mean": 0.010629373602569103, "step": 367, "step_time": 27.10269585700189 }, { "clip_ratio/high_max": 0.008924625697545707, "clip_ratio/high_mean": 0.0063078743405640125, "clip_ratio/low_mean": 0.004137735464610159, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010445609863381833, "entropy": 0.19917818810790777, "epoch": 0.00368, "grad_norm": 0.41537725925445557, "kl": 0.7857421673834324, "learning_rate": 2.2429047944161324e-05, "loss": 0.0037, "step": 368, "step_time": 7.940460645000712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2069.0, "completions/max_terminated_length": 2069.0, "completions/mean_length": 1913.875, "completions/mean_terminated_length": 1913.875, "completions/min_length": 1802.0, "completions/min_terminated_length": 1802.0, "entropy": 0.21539711952209473, "epoch": 0.00369, "frac_reward_zero_std": 0.0, "grad_norm": 0.8382825255393982, "kl": 0.8389957621693611, "learning_rate": 2.226317633284435e-05, "loss": 0.1707, "num_tokens": 13080691.0, "reward": -0.2684229016304016, "reward_std": 0.19070428609848022, "rewards/rollout_reward_func/mean": -0.2684229016304016, "rewards/rollout_reward_func/std": 0.19358457624912262, "sampling/importance_sampling_ratio/max": 1.4942532777786255, "sampling/importance_sampling_ratio/mean": 0.9998233318328857, "sampling/importance_sampling_ratio/min": 0.6275448203086853, "sampling/sampling_logp_difference/max": 0.4659402370452881, "sampling/sampling_logp_difference/mean": 0.015357425436377525, "step": 369, "step_time": 29.348080803998528 }, { "clip_ratio/high_max": 0.017195767490193248, "clip_ratio/high_mean": 0.008597883745096624, "clip_ratio/low_mean": 0.017034728080034256, "clip_ratio/low_min": 0.005736237042583525, "clip_ratio/region_mean": 0.025632611708715558, "entropy": 0.21312009077519178, "epoch": 0.0037, "grad_norm": 0.622150182723999, "kl": 0.8458880335092545, "learning_rate": 2.2098181373937283e-05, "loss": 0.1657, "step": 370, "step_time": 6.487901039996359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 1905.3125, "completions/mean_terminated_length": 1905.3125, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "entropy": 0.11238036584109068, "epoch": 0.00371, "frac_reward_zero_std": 0.0, "grad_norm": 0.8762730360031128, "kl": 0.6218791455030441, "learning_rate": 2.1934071109061663e-05, "loss": -0.0839, "num_tokens": 13167166.0, "reward": -0.24961228668689728, "reward_std": 0.223195880651474, "rewards/rollout_reward_func/mean": -0.24961228668689728, "rewards/rollout_reward_func/std": 0.271057665348053, "sampling/importance_sampling_ratio/max": 1.3225152492523193, "sampling/importance_sampling_ratio/mean": 0.9978280067443848, "sampling/importance_sampling_ratio/min": 0.5780985355377197, "sampling/sampling_logp_difference/max": 0.5480109453201294, "sampling/sampling_logp_difference/mean": 0.00838131457567215, "step": 371, "step_time": 28.937411421000434 }, { "clip_ratio/high_max": 0.00547036761417985, "clip_ratio/high_mean": 0.002735183807089925, "clip_ratio/low_mean": 0.004435262468177825, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00717044627526775, "entropy": 0.108968960121274, "epoch": 0.00372, "grad_norm": 0.48890888690948486, "kl": 0.5181532949209213, "learning_rate": 2.177085353672027e-05, "loss": -0.0857, "step": 372, "step_time": 6.621653077001611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2253.0, "completions/max_terminated_length": 2253.0, "completions/mean_length": 1952.34375, "completions/mean_terminated_length": 1952.34375, "completions/min_length": 1464.0, "completions/min_terminated_length": 1464.0, "entropy": 0.08189094765111804, "epoch": 0.00373, "frac_reward_zero_std": 0.0, "grad_norm": 0.38923245668411255, "kl": 0.40961431339383125, "learning_rate": 2.1608536611907304e-05, "loss": -0.0161, "num_tokens": 13255199.0, "reward": -0.2866019010543823, "reward_std": 0.2256014347076416, "rewards/rollout_reward_func/mean": -0.2866019010543823, "rewards/rollout_reward_func/std": 0.2427309900522232, "sampling/importance_sampling_ratio/max": 1.2562482357025146, "sampling/importance_sampling_ratio/mean": 0.9994159936904907, "sampling/importance_sampling_ratio/min": 0.7279756665229797, "sampling/sampling_logp_difference/max": 0.3174877166748047, "sampling/sampling_logp_difference/mean": 0.004523878917098045, "step": 373, "step_time": 29.437845348998962 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002631870564073324, "entropy": 0.08167525893077254, "epoch": 0.00374, "grad_norm": 0.30566293001174927, "kl": 0.41134730726480484, "learning_rate": 2.1447128245720683e-05, "loss": -0.0175, "step": 374, "step_time": 6.911214673997165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1905.53125, "completions/mean_terminated_length": 1905.53125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.09276177268475294, "epoch": 0.00375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3552224934101105, "kl": 0.6797048784792423, "learning_rate": 2.1286636304976478e-05, "loss": -0.0313, "num_tokens": 13341731.0, "reward": -0.2700548768043518, "reward_std": 0.3465014696121216, "rewards/rollout_reward_func/mean": -0.2700548768043518, "rewards/rollout_reward_func/std": 0.33792394399642944, "sampling/importance_sampling_ratio/max": 1.3302781581878662, "sampling/importance_sampling_ratio/mean": 1.0008878707885742, "sampling/importance_sampling_ratio/min": 0.7343172430992126, "sampling/sampling_logp_difference/max": 0.3088141679763794, "sampling/sampling_logp_difference/mean": 0.006356541067361832, "step": 375, "step_time": 28.674229873000513 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.008062510867603123, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009364594239741564, "entropy": 0.09242307208478451, "epoch": 0.00376, "grad_norm": 0.2791822850704193, "kl": 0.7420783564448357, "learning_rate": 2.1127068611825453e-05, "loss": -0.0331, "step": 376, "step_time": 6.820720825004173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2150.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 1930.0625, "completions/mean_terminated_length": 1930.0625, "completions/min_length": 1047.0, "completions/min_terminated_length": 1047.0, "entropy": 0.08619042113423347, "epoch": 0.00377, "frac_reward_zero_std": 0.0, "grad_norm": 0.34076955914497375, "kl": 0.4414917305111885, "learning_rate": 2.0968432943371854e-05, "loss": -0.0106, "num_tokens": 13429015.0, "reward": -0.2070711851119995, "reward_std": 0.22700129449367523, "rewards/rollout_reward_func/mean": -0.2070711851119995, "rewards/rollout_reward_func/std": 0.22285392880439758, "sampling/importance_sampling_ratio/max": 1.5171291828155518, "sampling/importance_sampling_ratio/mean": 0.9980796575546265, "sampling/importance_sampling_ratio/min": 0.41376104950904846, "sampling/sampling_logp_difference/max": 0.8824666738510132, "sampling/sampling_logp_difference/mean": 0.007632775232195854, "step": 377, "step_time": 28.300105656000596 }, { "clip_ratio/high_max": 0.00786790787242353, "clip_ratio/high_mean": 0.005236037308350205, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007840204052627087, "entropy": 0.08302951790392399, "epoch": 0.00378, "grad_norm": 0.2146891951560974, "kl": 0.4409993104636669, "learning_rate": 2.0810737031294356e-05, "loss": -0.0118, "step": 378, "step_time": 7.3914534830000775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1898.96875, "completions/mean_terminated_length": 1898.96875, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "entropy": 0.09709073137491941, "epoch": 0.00379, "frac_reward_zero_std": 0.0, "grad_norm": 0.5405853390693665, "kl": 0.48799536004662514, "learning_rate": 2.0653988561469253e-05, "loss": -0.0269, "num_tokens": 13515290.0, "reward": -0.36859631538391113, "reward_std": 0.31511902809143066, "rewards/rollout_reward_func/mean": -0.36859631538391113, "rewards/rollout_reward_func/std": 0.3176507353782654, "sampling/importance_sampling_ratio/max": 1.6995066404342651, "sampling/importance_sampling_ratio/mean": 1.0009839534759521, "sampling/importance_sampling_ratio/min": 0.7279605865478516, "sampling/sampling_logp_difference/max": 0.5303380489349365, "sampling/sampling_logp_difference/mean": 0.00652437936514616, "step": 379, "step_time": 27.398741568000332 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.0037188964197412133, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0063507669838145375, "entropy": 0.0951674161478877, "epoch": 0.0038, "grad_norm": 0.27580299973487854, "kl": 0.4911961555480957, "learning_rate": 2.0498195173595807e-05, "loss": -0.0276, "step": 380, "step_time": 6.731428025999776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1847.5625, "completions/mean_terminated_length": 1847.5625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.07391469180583954, "epoch": 0.00381, "frac_reward_zero_std": 0.0, "grad_norm": 0.4161422550678253, "kl": 0.494118332862854, "learning_rate": 2.0343364460823976e-05, "loss": -0.052, "num_tokens": 13599882.0, "reward": -0.2979568839073181, "reward_std": 0.30220669507980347, "rewards/rollout_reward_func/mean": -0.2979568839073181, "rewards/rollout_reward_func/std": 0.29856497049331665, "sampling/importance_sampling_ratio/max": 1.3388031721115112, "sampling/importance_sampling_ratio/mean": 1.0005851984024048, "sampling/importance_sampling_ratio/min": 0.6929327845573425, "sampling/sampling_logp_difference/max": 0.3668223023414612, "sampling/sampling_logp_difference/mean": 0.005544058978557587, "step": 381, "step_time": 27.365391216997523 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.006651345174759626, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.007953428546898067, "entropy": 0.07180237025022507, "epoch": 0.00382, "grad_norm": 0.24212320148944855, "kl": 0.5075628459453583, "learning_rate": 2.0189503969384236e-05, "loss": -0.0534, "step": 382, "step_time": 7.000674497001455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1923.75, "completions/mean_terminated_length": 1923.75, "completions/min_length": 1762.0, "completions/min_terminated_length": 1762.0, "entropy": 0.057647344190627337, "epoch": 0.00383, "frac_reward_zero_std": 0.0, "grad_norm": 0.425406813621521, "kl": 0.3763584513217211, "learning_rate": 2.0036621198219887e-05, "loss": -0.0019, "num_tokens": 13686934.0, "reward": -0.13870276510715485, "reward_std": 0.2559167742729187, "rewards/rollout_reward_func/mean": -0.13870276510715485, "rewards/rollout_reward_func/std": 0.2570273280143738, "sampling/importance_sampling_ratio/max": 1.276544213294983, "sampling/importance_sampling_ratio/mean": 1.0000559091567993, "sampling/importance_sampling_ratio/min": 0.7963462471961975, "sampling/sampling_logp_difference/max": 0.2441565990447998, "sampling/sampling_logp_difference/mean": 0.003174185287207365, "step": 383, "step_time": 27.989172109999345 }, { "clip_ratio/high_max": 0.007708333432674408, "clip_ratio/high_mean": 0.003854166716337204, "clip_ratio/low_mean": 0.002659574383869767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006513741100206971, "entropy": 0.05515117151662707, "epoch": 0.00384, "grad_norm": 0.2515657842159271, "kl": 0.392648845911026, "learning_rate": 1.988472359862151e-05, "loss": -0.0035, "step": 384, "step_time": 6.7562601110003015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 1988.875, "completions/mean_terminated_length": 1988.875, "completions/min_length": 1804.0, "completions/min_terminated_length": 1804.0, "entropy": 0.07965095434337854, "epoch": 0.00385, "frac_reward_zero_std": 0.0, "grad_norm": 0.37832286953926086, "kl": 0.4880429431796074, "learning_rate": 1.9733818573863822e-05, "loss": 0.0053, "num_tokens": 13776161.0, "reward": -0.2682172954082489, "reward_std": 0.23692333698272705, "rewards/rollout_reward_func/mean": -0.2682172954082489, "rewards/rollout_reward_func/std": 0.23154155910015106, "sampling/importance_sampling_ratio/max": 1.3514435291290283, "sampling/importance_sampling_ratio/mean": 0.9994603395462036, "sampling/importance_sampling_ratio/min": 0.4945390224456787, "sampling/sampling_logp_difference/max": 0.7041292190551758, "sampling/sampling_logp_difference/mean": 0.00594722107052803, "step": 385, "step_time": 28.110362606001218 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 0.07942617405205965, "epoch": 0.00386, "grad_norm": 0.36147642135620117, "kl": 0.4841915927827358, "learning_rate": 1.9583913478844825e-05, "loss": 0.0043, "step": 386, "step_time": 6.70776969399958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2202.0, "completions/max_terminated_length": 2202.0, "completions/mean_length": 1930.6875, "completions/mean_terminated_length": 1930.6875, "completions/min_length": 1402.0, "completions/min_terminated_length": 1402.0, "entropy": 0.06152026914060116, "epoch": 0.00387, "frac_reward_zero_std": 0.0, "grad_norm": 0.5614769458770752, "kl": 0.4931205380707979, "learning_rate": 1.943501561972737e-05, "loss": 0.0317, "num_tokens": 13863458.0, "reward": -0.24557562172412872, "reward_std": 0.3272491991519928, "rewards/rollout_reward_func/mean": -0.24557562172412872, "rewards/rollout_reward_func/std": 0.35364988446235657, "sampling/importance_sampling_ratio/max": 1.4333710670471191, "sampling/importance_sampling_ratio/mean": 1.001246690750122, "sampling/importance_sampling_ratio/min": 0.869192361831665, "sampling/sampling_logp_difference/max": 0.3600289821624756, "sampling/sampling_logp_difference/mean": 0.0027777059003710747, "step": 387, "step_time": 27.935356747000696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004375853925012052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004375853925012052, "entropy": 0.06135493563488126, "epoch": 0.00388, "grad_norm": 0.35176539421081543, "kl": 0.4906487688422203, "learning_rate": 1.9287132253583058e-05, "loss": 0.0285, "step": 388, "step_time": 7.468276888997934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 2011.6875, "completions/mean_terminated_length": 2011.6875, "completions/min_length": 1852.0, "completions/min_terminated_length": 1852.0, "entropy": 0.0688346535898745, "epoch": 0.00389, "frac_reward_zero_std": 0.0, "grad_norm": 0.2803843915462494, "kl": 0.36022088304162025, "learning_rate": 1.914027058803851e-05, "loss": 0.0036, "num_tokens": 13953411.0, "reward": -0.27870845794677734, "reward_std": 0.19971157610416412, "rewards/rollout_reward_func/mean": -0.27870845794677734, "rewards/rollout_reward_func/std": 0.20552389323711395, "sampling/importance_sampling_ratio/max": 1.1597851514816284, "sampling/importance_sampling_ratio/mean": 0.9980392456054688, "sampling/importance_sampling_ratio/min": 0.6000890135765076, "sampling/sampling_logp_difference/max": 0.5106773376464844, "sampling/sampling_logp_difference/mean": 0.004868931137025356, "step": 389, "step_time": 28.26693949699984 }, { "clip_ratio/high_max": 0.00786790787242353, "clip_ratio/high_mean": 0.003933953936211765, "clip_ratio/low_mean": 0.002631870564073324, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006565824500285089, "entropy": 0.06820002570748329, "epoch": 0.0039, "grad_norm": 0.22925035655498505, "kl": 0.3574036881327629, "learning_rate": 1.8994437780924103e-05, "loss": 0.0018, "step": 390, "step_time": 6.677996749000158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2260.0, "completions/max_terminated_length": 2260.0, "completions/mean_length": 1986.84375, "completions/mean_terminated_length": 1986.84375, "completions/min_length": 1803.0, "completions/min_terminated_length": 1803.0, "entropy": 0.08141213655471802, "epoch": 0.00391, "frac_reward_zero_std": 0.0, "grad_norm": 0.6056037545204163, "kl": 0.4483014978468418, "learning_rate": 1.884964093992513e-05, "loss": 0.0037, "num_tokens": 14042557.0, "reward": -0.3109396994113922, "reward_std": 0.24177302420139313, "rewards/rollout_reward_func/mean": -0.3109396994113922, "rewards/rollout_reward_func/std": 0.2639472186565399, "sampling/importance_sampling_ratio/max": 1.4637600183486938, "sampling/importance_sampling_ratio/mean": 0.9985758662223816, "sampling/importance_sampling_ratio/min": 0.31669867038726807, "sampling/sampling_logp_difference/max": 1.1498045921325684, "sampling/sampling_logp_difference/mean": 0.006633114069700241, "step": 391, "step_time": 29.398411664997184 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.006594732985831797, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007896816357970238, "entropy": 0.08251305110752583, "epoch": 0.00392, "grad_norm": 0.3521493375301361, "kl": 0.4173557721078396, "learning_rate": 1.870588712223535e-05, "loss": 0.002, "step": 392, "step_time": 7.603408847002356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 1950.125, "completions/mean_terminated_length": 1950.125, "completions/min_length": 1810.0, "completions/min_terminated_length": 1810.0, "entropy": 0.06546863308176398, "epoch": 0.00393, "frac_reward_zero_std": 0.0, "grad_norm": 0.4908037483692169, "kl": 0.32551200315356255, "learning_rate": 1.8563183334213013e-05, "loss": 0.0036, "num_tokens": 14130496.0, "reward": -0.26419106125831604, "reward_std": 0.2656874358654022, "rewards/rollout_reward_func/mean": -0.26419106125831604, "rewards/rollout_reward_func/std": 0.25990426540374756, "sampling/importance_sampling_ratio/max": 1.444229006767273, "sampling/importance_sampling_ratio/mean": 0.9988200664520264, "sampling/importance_sampling_ratio/min": 0.48279985785484314, "sampling/sampling_logp_difference/max": 0.7281531095504761, "sampling/sampling_logp_difference/mean": 0.004563077352941036, "step": 393, "step_time": 27.33159875599995 }, { "clip_ratio/high_max": 0.007978723151609302, "clip_ratio/high_mean": 0.003989361575804651, "clip_ratio/low_mean": 0.003961657756008208, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007951019331812859, "entropy": 0.06400075182318687, "epoch": 0.00394, "grad_norm": 0.27093592286109924, "kl": 0.32389672100543976, "learning_rate": 1.842153653103942e-05, "loss": 0.002, "step": 394, "step_time": 6.59037972599981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 1911.0625, "completions/mean_terminated_length": 1911.0625, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "entropy": 0.0783872022293508, "epoch": 0.00395, "frac_reward_zero_std": 0.0, "grad_norm": 0.5759827494621277, "kl": 0.36734145879745483, "learning_rate": 1.828095361637995e-05, "loss": 0.0096, "num_tokens": 14217146.0, "reward": -0.28406065702438354, "reward_std": 0.3388478755950928, "rewards/rollout_reward_func/mean": -0.28406065702438354, "rewards/rollout_reward_func/std": 0.35768207907676697, "sampling/importance_sampling_ratio/max": 1.5633175373077393, "sampling/importance_sampling_ratio/mean": 1.0008362531661987, "sampling/importance_sampling_ratio/min": 1.590613351254433e-06, "sampling/sampling_logp_difference/max": 13.351390838623047, "sampling/sampling_logp_difference/mean": 0.022647429257631302, "step": 395, "step_time": 28.129317499002354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004227543366141617, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004227543366141617, "entropy": 0.0782654620707035, "epoch": 0.00396, "grad_norm": 0.402500718832016, "kl": 0.3707554414868355, "learning_rate": 1.8141441442047516e-05, "loss": 0.0084, "step": 396, "step_time": 7.138548373002777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 1933.53125, "completions/mean_terminated_length": 1933.53125, "completions/min_length": 1577.0, "completions/min_terminated_length": 1577.0, "entropy": 0.06763228960335255, "epoch": 0.00397, "frac_reward_zero_std": 0.0, "grad_norm": 0.47304287552833557, "kl": 0.44626615568995476, "learning_rate": 1.800300680766868e-05, "loss": -0.0035, "num_tokens": 14304524.0, "reward": -0.33282461762428284, "reward_std": 0.24420557916164398, "rewards/rollout_reward_func/mean": -0.33282461762428284, "rewards/rollout_reward_func/std": 0.25186458230018616, "sampling/importance_sampling_ratio/max": 2.400805950164795, "sampling/importance_sampling_ratio/mean": 1.0040220022201538, "sampling/importance_sampling_ratio/min": 0.8277620077133179, "sampling/sampling_logp_difference/max": 0.8758044242858887, "sampling/sampling_logp_difference/mean": 0.0061488766223192215, "step": 397, "step_time": 27.675827157996537 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.003906250116415322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006538120796903968, "entropy": 0.06886283215135336, "epoch": 0.00398, "grad_norm": 0.2867587208747864, "kl": 0.43570801988244057, "learning_rate": 1.78656564603522e-05, "loss": -0.0055, "step": 398, "step_time": 7.420786632003001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1963.4375, "completions/mean_terminated_length": 1963.4375, "completions/min_length": 1440.0, "completions/min_terminated_length": 1440.0, "entropy": 0.05722304107621312, "epoch": 0.00399, "frac_reward_zero_std": 0.0, "grad_norm": 0.28128838539123535, "kl": 0.3284569978713989, "learning_rate": 1.7729397094360248e-05, "loss": 0.0063, "num_tokens": 14392930.0, "reward": -0.24342823028564453, "reward_std": 0.3531992435455322, "rewards/rollout_reward_func/mean": -0.24342823028564453, "rewards/rollout_reward_func/std": 0.3587290942668915, "sampling/importance_sampling_ratio/max": 1.4372109174728394, "sampling/importance_sampling_ratio/mean": 1.0008769035339355, "sampling/importance_sampling_ratio/min": 0.8222236633300781, "sampling/sampling_logp_difference/max": 0.3627042770385742, "sampling/sampling_logp_difference/mean": 0.0031419687438756227, "step": 399, "step_time": 27.739881139001227 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.002659574383869767, "clip_ratio/low_mean": 0.006565824500285089, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.009225398884154856, "entropy": 0.05765126412734389, "epoch": 0.004, "grad_norm": 0.10893956571817398, "kl": 0.32365728728473186, "learning_rate": 1.7594235350782084e-05, "loss": 0.0048, "step": 400, "step_time": 6.8715451049993135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2205.0, "completions/max_terminated_length": 2205.0, "completions/mean_length": 1983.90625, "completions/mean_terminated_length": 1983.90625, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "entropy": 0.0640164795331657, "epoch": 0.00401, "frac_reward_zero_std": 0.0, "grad_norm": 0.767676055431366, "kl": 0.3235536776483059, "learning_rate": 1.7460177817210404e-05, "loss": -0.01, "num_tokens": 14482042.0, "reward": -0.2899346649646759, "reward_std": 0.3241998553276062, "rewards/rollout_reward_func/mean": -0.2899346649646759, "rewards/rollout_reward_func/std": 0.3305411636829376, "sampling/importance_sampling_ratio/max": 2.4194326400756836, "sampling/importance_sampling_ratio/mean": 1.00095796585083, "sampling/importance_sampling_ratio/min": 0.7277583479881287, "sampling/sampling_logp_difference/max": 0.8835331201553345, "sampling/sampling_logp_difference/mean": 0.004069449380040169, "step": 401, "step_time": 28.2060206510032 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.0651022894307971, "epoch": 0.00402, "grad_norm": 0.3754366338253021, "kl": 0.3202049024403095, "learning_rate": 1.732723102742027e-05, "loss": -0.0121, "step": 402, "step_time": 6.75553004899848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 1988.65625, "completions/mean_terminated_length": 1988.65625, "completions/min_length": 1802.0, "completions/min_terminated_length": 1802.0, "entropy": 0.0667225350625813, "epoch": 0.00403, "frac_reward_zero_std": 0.0, "grad_norm": 0.6768569350242615, "kl": 0.36207200959324837, "learning_rate": 1.7195401461050656e-05, "loss": 0.0034, "num_tokens": 14571269.0, "reward": -0.22941750288009644, "reward_std": 0.1761663258075714, "rewards/rollout_reward_func/mean": -0.22941750288009644, "rewards/rollout_reward_func/std": 0.22676746547222137, "sampling/importance_sampling_ratio/max": 2.5603930950164795, "sampling/importance_sampling_ratio/mean": 1.0030498504638672, "sampling/importance_sampling_ratio/min": 0.46061763167381287, "sampling/sampling_logp_difference/max": 0.9401607513427734, "sampling/sampling_logp_difference/mean": 0.005978971719741821, "step": 403, "step_time": 28.864905211003133 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.006567029166035354, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007896816357970238, "entropy": 0.06676412979140878, "epoch": 0.00404, "grad_norm": 0.4834277033805847, "kl": 0.49166369810700417, "learning_rate": 1.7064695543288636e-05, "loss": 0.003, "step": 404, "step_time": 6.761238670998864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 1910.21875, "completions/mean_terminated_length": 1910.21875, "completions/min_length": 1389.0, "completions/min_terminated_length": 1389.0, "entropy": 0.06678978865966201, "epoch": 0.00405, "frac_reward_zero_std": 0.0, "grad_norm": 0.4136320650577545, "kl": 0.5003938190639019, "learning_rate": 1.6935119644556266e-05, "loss": -0.0053, "num_tokens": 14657899.0, "reward": -0.3465886116027832, "reward_std": 0.21207541227340698, "rewards/rollout_reward_func/mean": -0.3465886116027832, "rewards/rollout_reward_func/std": 0.2211671620607376, "sampling/importance_sampling_ratio/max": 1.3890410661697388, "sampling/importance_sampling_ratio/mean": 1.0007951259613037, "sampling/importance_sampling_ratio/min": 0.32478851079940796, "sampling/sampling_logp_difference/max": 1.124580979347229, "sampling/sampling_logp_difference/mean": 0.006082427687942982, "step": 405, "step_time": 27.70942749200003 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0014880952658131719, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005394345382228494, "entropy": 0.07010808167979121, "epoch": 0.00406, "grad_norm": 0.34199944138526917, "kl": 0.48158348351716995, "learning_rate": 1.6806680080200022e-05, "loss": -0.0072, "step": 406, "step_time": 7.114952855999945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2160.0, "completions/max_terminated_length": 2160.0, "completions/mean_length": 1919.53125, "completions/mean_terminated_length": 1919.53125, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "entropy": 0.09650693833827972, "epoch": 0.00407, "frac_reward_zero_std": 0.0, "grad_norm": 0.49600881338119507, "kl": 0.7773862816393375, "learning_rate": 1.6679383110183078e-05, "loss": 0.0031, "num_tokens": 14744874.0, "reward": -0.2790583372116089, "reward_std": 0.3491203188896179, "rewards/rollout_reward_func/mean": -0.2790583372116089, "rewards/rollout_reward_func/std": 0.3535960018634796, "sampling/importance_sampling_ratio/max": 1.209394097328186, "sampling/importance_sampling_ratio/mean": 0.9990273714065552, "sampling/importance_sampling_ratio/min": 0.5770361423492432, "sampling/sampling_logp_difference/max": 0.5498504638671875, "sampling/sampling_logp_difference/mean": 0.007050462067127228, "step": 407, "step_time": 28.65530228100033 }, { "clip_ratio/high_max": 0.003289473708719015, "clip_ratio/high_mean": 0.0029468201100826263, "clip_ratio/low_mean": 0.005236037308350205, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.008182857418432832, "entropy": 0.09934874624013901, "epoch": 0.00408, "grad_norm": 0.3833370506763458, "kl": 0.7228484712541103, "learning_rate": 1.6553234938780157e-05, "loss": 0.0011, "step": 408, "step_time": 6.7223014699993655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 1961.5, "completions/mean_terminated_length": 1961.5, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.09396483981981874, "epoch": 0.00409, "frac_reward_zero_std": 0.0, "grad_norm": 40.50089645385742, "kl": 12.110166233032942, "learning_rate": 1.6428241714275178e-05, "loss": 0.5391, "num_tokens": 14833229.0, "reward": -0.25999051332473755, "reward_std": 0.3298472762107849, "rewards/rollout_reward_func/mean": -0.25999051332473755, "rewards/rollout_reward_func/std": 0.3832211196422577, "sampling/importance_sampling_ratio/max": 1.4515888690948486, "sampling/importance_sampling_ratio/mean": 0.993827223777771, "sampling/importance_sampling_ratio/min": 3.363699596636849e-11, "sampling/sampling_logp_difference/max": 24.115394592285156, "sampling/sampling_logp_difference/mean": 0.05416573956608772, "step": 409, "step_time": 27.594794661003107 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "entropy": 0.09593612561002374, "epoch": 0.0041, "grad_norm": 15.290627479553223, "kl": 4.838979959487915, "learning_rate": 1.6304409528661554e-05, "loss": 0.2835, "step": 410, "step_time": 6.780332135000208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2214.0, "completions/max_terminated_length": 2214.0, "completions/mean_length": 1972.125, "completions/mean_terminated_length": 1972.125, "completions/min_length": 1734.0, "completions/min_terminated_length": 1734.0, "entropy": 0.08866999670863152, "epoch": 0.00411, "frac_reward_zero_std": 0.0, "grad_norm": 0.6107754707336426, "kl": 0.7819295134395361, "learning_rate": 1.6181744417345315e-05, "loss": 0.0021, "num_tokens": 14921897.0, "reward": -0.27578091621398926, "reward_std": 0.2863690257072449, "rewards/rollout_reward_func/mean": -0.27578091621398926, "rewards/rollout_reward_func/std": 0.33068010210990906, "sampling/importance_sampling_ratio/max": 1.4013584852218628, "sampling/importance_sampling_ratio/mean": 0.9988202452659607, "sampling/importance_sampling_ratio/min": 0.6421798467636108, "sampling/sampling_logp_difference/max": 0.4428868293762207, "sampling/sampling_logp_difference/mean": 0.005887433886528015, "step": 411, "step_time": 28.66713555300157 }, { "clip_ratio/high_max": 0.007923315512016416, "clip_ratio/high_mean": 0.005263741128146648, "clip_ratio/low_mean": 0.005291444947943091, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.01055518607608974, "entropy": 0.0914617357775569, "epoch": 0.00412, "grad_norm": 0.350209504365921, "kl": 0.6747572626918554, "learning_rate": 1.606025235885093e-05, "loss": -0.0004, "step": 412, "step_time": 6.765173073998085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 1933.6875, "completions/mean_terminated_length": 1933.6875, "completions/min_length": 1330.0, "completions/min_terminated_length": 1330.0, "entropy": 0.09031174331903458, "epoch": 0.00413, "frac_reward_zero_std": 0.0, "grad_norm": 0.44795361161231995, "kl": 0.5164077840745449, "learning_rate": 1.5939939274529924e-05, "loss": -0.0051, "num_tokens": 15009297.0, "reward": -0.3568209409713745, "reward_std": 0.237154021859169, "rewards/rollout_reward_func/mean": -0.3568209409713745, "rewards/rollout_reward_func/std": 0.24314400553703308, "sampling/importance_sampling_ratio/max": 1.2554631233215332, "sampling/importance_sampling_ratio/mean": 1.0000483989715576, "sampling/importance_sampling_ratio/min": 0.7007449269294739, "sampling/sampling_logp_difference/max": 0.35561132431030273, "sampling/sampling_logp_difference/mean": 0.005155131220817566, "step": 413, "step_time": 28.91041491699798 }, { "clip_ratio/high_max": 0.00786790787242353, "clip_ratio/high_mean": 0.005236037191934884, "clip_ratio/low_mean": 0.005236037308350205, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.010472074500285089, "entropy": 0.09483368787914515, "epoch": 0.00414, "grad_norm": 0.38056841492652893, "kl": 0.49122459068894386, "learning_rate": 1.5820811028272302e-05, "loss": -0.0064, "step": 414, "step_time": 6.587977896002485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 2012.21875, "completions/mean_terminated_length": 2012.21875, "completions/min_length": 1818.0, "completions/min_terminated_length": 1818.0, "entropy": 0.09649765118956566, "epoch": 0.00415, "frac_reward_zero_std": 0.0, "grad_norm": 0.4328184723854065, "kl": 0.3752498086541891, "learning_rate": 1.570287342622072e-05, "loss": 0.0049, "num_tokens": 15099275.0, "reward": -0.3733634352684021, "reward_std": 0.2802848219871521, "rewards/rollout_reward_func/mean": -0.3733634352684021, "rewards/rollout_reward_func/std": 0.2895722985267639, "sampling/importance_sampling_ratio/max": 1.705918788909912, "sampling/importance_sampling_ratio/mean": 0.9999206066131592, "sampling/importance_sampling_ratio/min": 0.6895296573638916, "sampling/sampling_logp_difference/max": 0.5341038703918457, "sampling/sampling_logp_difference/mean": 0.005279987119138241, "step": 415, "step_time": 28.5543391909996 }, { "clip_ratio/high_max": 0.005376965738832951, "clip_ratio/high_mean": 0.0026884828694164753, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003990566241554916, "entropy": 0.10054922755807638, "epoch": 0.00416, "grad_norm": 0.36900264024734497, "kl": 0.3719754461199045, "learning_rate": 1.558613221648751e-05, "loss": 0.0037, "step": 416, "step_time": 7.3913587070001086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 1937.625, "completions/mean_terminated_length": 1937.625, "completions/min_length": 1200.0, "completions/min_terminated_length": 1200.0, "entropy": 0.12245809007436037, "epoch": 0.00417, "frac_reward_zero_std": 0.0, "grad_norm": 0.5259190797805786, "kl": 0.49605320766568184, "learning_rate": 1.547059308887454e-05, "loss": 0.0125, "num_tokens": 15186815.0, "reward": -0.29868781566619873, "reward_std": 0.27307265996932983, "rewards/rollout_reward_func/mean": -0.29868781566619873, "rewards/rollout_reward_func/std": 0.3266037106513977, "sampling/importance_sampling_ratio/max": 1.476646065711975, "sampling/importance_sampling_ratio/mean": 1.000206470489502, "sampling/importance_sampling_ratio/min": 0.6835705041885376, "sampling/sampling_logp_difference/max": 0.3897733688354492, "sampling/sampling_logp_difference/mean": 0.00660916231572628, "step": 417, "step_time": 28.88536455800022 }, { "clip_ratio/high_max": 0.010872134938836098, "clip_ratio/high_mean": 0.005436067469418049, "clip_ratio/low_mean": 0.0027502417797222733, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008186309249140322, "entropy": 0.12709292583167553, "epoch": 0.00418, "grad_norm": 0.48511430621147156, "kl": 0.49709195643663406, "learning_rate": 1.5356261674595888e-05, "loss": 0.0114, "step": 418, "step_time": 6.642192526000144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 1941.15625, "completions/mean_terminated_length": 1941.15625, "completions/min_length": 1334.0, "completions/min_terminated_length": 1334.0, "entropy": 0.11542581953108311, "epoch": 0.00419, "frac_reward_zero_std": 0.0, "grad_norm": 0.32973921298980713, "kl": 0.39978159964084625, "learning_rate": 1.5243143546003393e-05, "loss": -0.0101, "num_tokens": 15274478.0, "reward": -0.3560764491558075, "reward_std": 0.21171441674232483, "rewards/rollout_reward_func/mean": -0.3560764491558075, "rewards/rollout_reward_func/std": 0.21297945082187653, "sampling/importance_sampling_ratio/max": 1.5840860605239868, "sampling/importance_sampling_ratio/mean": 1.0020616054534912, "sampling/importance_sampling_ratio/min": 0.8293798565864563, "sampling/sampling_logp_difference/max": 0.46000760793685913, "sampling/sampling_logp_difference/mean": 0.005377276800572872, "step": 419, "step_time": 27.47050852400207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004020759486593306, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004020759486593306, "entropy": 0.11798083037137985, "epoch": 0.0042, "grad_norm": 0.2582690119743347, "kl": 0.39797788858413696, "learning_rate": 1.5131244216315056e-05, "loss": -0.0112, "step": 420, "step_time": 7.117073044997596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 1980.8125, "completions/mean_terminated_length": 1980.8125, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "entropy": 0.14186043478548527, "epoch": 0.00421, "frac_reward_zero_std": 0.0, "grad_norm": 0.528929591178894, "kl": 0.43039626628160477, "learning_rate": 1.5020569139346325e-05, "loss": -0.0219, "num_tokens": 15363488.0, "reward": -0.2897564172744751, "reward_std": 0.28248435258865356, "rewards/rollout_reward_func/mean": -0.2897564172744751, "rewards/rollout_reward_func/std": 0.28208380937576294, "sampling/importance_sampling_ratio/max": 1.485063910484314, "sampling/importance_sampling_ratio/mean": 1.0004338026046753, "sampling/importance_sampling_ratio/min": 0.8532211780548096, "sampling/sampling_logp_difference/max": 0.39545774459838867, "sampling/sampling_logp_difference/mean": 0.006948579102754593, "step": 421, "step_time": 28.108789879999676 }, { "clip_ratio/high_max": 0.008041518973186612, "clip_ratio/high_mean": 0.004020759486593306, "clip_ratio/low_mean": 0.0067658546613529325, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010786614147946239, "entropy": 0.14119906350970268, "epoch": 0.00422, "grad_norm": 0.23460648953914642, "kl": 0.4257427714765072, "learning_rate": 1.4911123709244329e-05, "loss": -0.0233, "step": 422, "step_time": 7.479982175000259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2180.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 1967.1875, "completions/mean_terminated_length": 1967.1875, "completions/min_length": 1727.0, "completions/min_terminated_length": 1727.0, "entropy": 0.13894116692245007, "epoch": 0.00423, "frac_reward_zero_std": 0.0, "grad_norm": 0.5666837692260742, "kl": 0.48937472701072693, "learning_rate": 1.4802913260224927e-05, "loss": 0.0052, "num_tokens": 15451997.0, "reward": -0.37778857350349426, "reward_std": 0.28945112228393555, "rewards/rollout_reward_func/mean": -0.37778857350349426, "rewards/rollout_reward_func/std": 0.279699444770813, "sampling/importance_sampling_ratio/max": 1.3201322555541992, "sampling/importance_sampling_ratio/mean": 1.0015673637390137, "sampling/importance_sampling_ratio/min": 0.7411907315254211, "sampling/sampling_logp_difference/max": 0.29949724674224854, "sampling/sampling_logp_difference/mean": 0.007674708031117916, "step": 423, "step_time": 28.307244596997407 }, { "clip_ratio/high_max": 0.007925724843516946, "clip_ratio/high_mean": 0.003962862421758473, "clip_ratio/low_mean": 0.0027186761144548655, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00668153865262866, "entropy": 0.13741643726825714, "epoch": 0.00424, "grad_norm": 0.31505510210990906, "kl": 0.49263686686754227, "learning_rate": 1.469594306631273e-05, "loss": 0.0044, "step": 424, "step_time": 6.717388055998526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 1958.25, "completions/mean_terminated_length": 1958.25, "completions/min_length": 1755.0, "completions/min_terminated_length": 1755.0, "entropy": 0.13471302576363087, "epoch": 0.00425, "frac_reward_zero_std": 0.0, "grad_norm": 0.38305002450942993, "kl": 0.5495638363063335, "learning_rate": 1.4590218341084088e-05, "loss": -0.0018, "num_tokens": 15540196.0, "reward": -0.33147597312927246, "reward_std": 0.2107001692056656, "rewards/rollout_reward_func/mean": -0.33147597312927246, "rewards/rollout_reward_func/std": 0.2261766940355301, "sampling/importance_sampling_ratio/max": 1.2554430961608887, "sampling/importance_sampling_ratio/mean": 0.999242901802063, "sampling/importance_sampling_ratio/min": 0.7219120860099792, "sampling/sampling_logp_difference/max": 0.3258519172668457, "sampling/sampling_logp_difference/mean": 0.007209521718323231, "step": 425, "step_time": 28.595198797000194 }, { "clip_ratio/high_max": 0.0027173913549631834, "clip_ratio/high_mean": 0.0026607790496200323, "clip_ratio/low_mean": 0.0027475846000015736, "clip_ratio/low_min": 0.0027173913549631834, "clip_ratio/region_mean": 0.005408363649621606, "entropy": 0.13353394530713558, "epoch": 0.00426, "grad_norm": 0.3784203827381134, "kl": 0.5728055462241173, "learning_rate": 1.4485744237412947e-05, "loss": -0.0035, "step": 426, "step_time": 6.679047075000199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1969.03125, "completions/mean_terminated_length": 1969.03125, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "entropy": 0.12441228795796633, "epoch": 0.00427, "frac_reward_zero_std": 0.0, "grad_norm": 0.5317200422286987, "kl": 0.3626866117119789, "learning_rate": 1.4382525847219732e-05, "loss": -0.0021, "num_tokens": 15628801.0, "reward": -0.3563631772994995, "reward_std": 0.40095508098602295, "rewards/rollout_reward_func/mean": -0.3563631772994995, "rewards/rollout_reward_func/std": 0.40249666571617126, "sampling/importance_sampling_ratio/max": 1.1764429807662964, "sampling/importance_sampling_ratio/mean": 0.9985589981079102, "sampling/importance_sampling_ratio/min": 0.7163174152374268, "sampling/sampling_logp_difference/max": 0.3336319327354431, "sampling/sampling_logp_difference/mean": 0.005529558286070824, "step": 427, "step_time": 28.99827540199658 }, { "clip_ratio/high_max": 0.007925724843516946, "clip_ratio/high_mean": 0.003962862421758473, "clip_ratio/low_mean": 0.002631870564073324, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006594732985831797, "entropy": 0.12293144688010216, "epoch": 0.00428, "grad_norm": 0.2547270655632019, "kl": 0.3724898211658001, "learning_rate": 1.4280568201223166e-05, "loss": -0.0027, "step": 428, "step_time": 6.811676973999056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 1918.78125, "completions/mean_terminated_length": 1918.78125, "completions/min_length": 1679.0, "completions/min_terminated_length": 1679.0, "entropy": 0.11178404092788696, "epoch": 0.00429, "frac_reward_zero_std": 0.0, "grad_norm": 0.3992089629173279, "kl": 0.39855753630399704, "learning_rate": 1.4179876268695064e-05, "loss": -0.0056, "num_tokens": 15715730.0, "reward": -0.2960454523563385, "reward_std": 0.32936036586761475, "rewards/rollout_reward_func/mean": -0.2960454523563385, "rewards/rollout_reward_func/std": 0.3272988796234131, "sampling/importance_sampling_ratio/max": 1.236393928527832, "sampling/importance_sampling_ratio/mean": 0.9999960660934448, "sampling/importance_sampling_ratio/min": 0.7753285765647888, "sampling/sampling_logp_difference/max": 0.25446832180023193, "sampling/sampling_logp_difference/mean": 0.005039893090724945, "step": 429, "step_time": 28.262166099999376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0013888889225199819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888889225199819, "entropy": 0.11082301894202828, "epoch": 0.0043, "grad_norm": 0.36299464106559753, "kl": 0.40001294389367104, "learning_rate": 1.408045495721816e-05, "loss": -0.0067, "step": 430, "step_time": 7.0367676580008265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 1969.78125, "completions/mean_terminated_length": 1969.78125, "completions/min_length": 1424.0, "completions/min_terminated_length": 1424.0, "entropy": 0.11157228611409664, "epoch": 0.00431, "frac_reward_zero_std": 0.0, "grad_norm": 0.3754769563674927, "kl": 0.39178966730833054, "learning_rate": 1.398230911244692e-05, "loss": 0.0077, "num_tokens": 15804344.0, "reward": -0.22196847200393677, "reward_std": 0.3331807255744934, "rewards/rollout_reward_func/mean": -0.22196847200393677, "rewards/rollout_reward_func/std": 0.3569449782371521, "sampling/importance_sampling_ratio/max": 1.3215572834014893, "sampling/importance_sampling_ratio/mean": 1.0000885725021362, "sampling/importance_sampling_ratio/min": 0.7811004519462585, "sampling/sampling_logp_difference/max": 0.2788107693195343, "sampling/sampling_logp_difference/mean": 0.004692981950938702, "step": 431, "step_time": 27.485871010001574 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.002659574383869767, "clip_ratio/low_mean": 0.003906250116415322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006565824500285089, "entropy": 0.11168467905372381, "epoch": 0.00432, "grad_norm": 0.2530343234539032, "kl": 0.39404914900660515, "learning_rate": 1.3885443517871358e-05, "loss": 0.0062, "step": 432, "step_time": 6.685317585999655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 1994.1875, "completions/mean_terminated_length": 1994.1875, "completions/min_length": 1832.0, "completions/min_terminated_length": 1832.0, "entropy": 0.13002021331340075, "epoch": 0.00433, "frac_reward_zero_std": 0.0, "grad_norm": 0.5809800028800964, "kl": 0.4483966492116451, "learning_rate": 1.3789862894583897e-05, "loss": 0.0051, "num_tokens": 15893738.0, "reward": -0.32575100660324097, "reward_std": 0.3211192488670349, "rewards/rollout_reward_func/mean": -0.32575100660324097, "rewards/rollout_reward_func/std": 0.3380368649959564, "sampling/importance_sampling_ratio/max": 2.0213489532470703, "sampling/importance_sampling_ratio/mean": 1.0041065216064453, "sampling/importance_sampling_ratio/min": 0.6525455117225647, "sampling/sampling_logp_difference/max": 0.7037651538848877, "sampling/sampling_logp_difference/mean": 0.009868120774626732, "step": 433, "step_time": 28.534948587001054 }, { "clip_ratio/high_max": 0.005321558099240065, "clip_ratio/high_mean": 0.003990566241554916, "clip_ratio/low_mean": 0.005236037308350205, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009226603549905121, "entropy": 0.1312422128394246, "epoch": 0.00434, "grad_norm": 0.6198850870132446, "kl": 0.45084450021386147, "learning_rate": 1.369557190104926e-05, "loss": 0.0045, "step": 434, "step_time": 7.259805867002797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2172.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 1916.625, "completions/mean_terminated_length": 1916.625, "completions/min_length": 1554.0, "completions/min_terminated_length": 1554.0, "entropy": 0.10694527812302113, "epoch": 0.00435, "frac_reward_zero_std": 0.0, "grad_norm": 0.6088537573814392, "kl": 0.6352979745715857, "learning_rate": 1.3602575132877464e-05, "loss": -0.0014, "num_tokens": 15980606.0, "reward": -0.2542678415775299, "reward_std": 0.22714872658252716, "rewards/rollout_reward_func/mean": -0.2542678415775299, "rewards/rollout_reward_func/std": 0.24382655322551727, "sampling/importance_sampling_ratio/max": 1.2562577724456787, "sampling/importance_sampling_ratio/mean": 1.0003743171691895, "sampling/importance_sampling_ratio/min": 0.8367946147918701, "sampling/sampling_logp_difference/max": 0.22813725471496582, "sampling/sampling_logp_difference/mean": 0.00479900510981679, "step": 435, "step_time": 27.928846239999984 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0027186761144548655, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005322842858731747, "entropy": 0.1066846689209342, "epoch": 0.00436, "grad_norm": 0.33773478865623474, "kl": 0.6049687564373016, "learning_rate": 1.3510877122599786e-05, "loss": -0.0025, "step": 436, "step_time": 6.710307776002082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1976.1875, "completions/mean_terminated_length": 1976.1875, "completions/min_length": 1669.0, "completions/min_terminated_length": 1669.0, "entropy": 0.13351814448833466, "epoch": 0.00437, "frac_reward_zero_std": 0.0, "grad_norm": 0.5789887309074402, "kl": 0.44297912530601025, "learning_rate": 1.3420482339447866e-05, "loss": -0.0027, "num_tokens": 16069454.0, "reward": -0.3774734139442444, "reward_std": 0.21577659249305725, "rewards/rollout_reward_func/mean": -0.3774734139442444, "rewards/rollout_reward_func/std": 0.2345554381608963, "sampling/importance_sampling_ratio/max": 1.5473421812057495, "sampling/importance_sampling_ratio/mean": 0.9998093843460083, "sampling/importance_sampling_ratio/min": 0.6823650598526001, "sampling/sampling_logp_difference/max": 0.4365386962890625, "sampling/sampling_logp_difference/mean": 0.008165208622813225, "step": 437, "step_time": 29.18878411699734 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.005263741128146648, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007895611692219973, "entropy": 0.13207246735692024, "epoch": 0.00438, "grad_norm": 0.39617395401000977, "kl": 0.4442360885441303, "learning_rate": 1.3331395189135924e-05, "loss": -0.0053, "step": 438, "step_time": 6.750917282000955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 1950.59375, "completions/mean_terminated_length": 1950.59375, "completions/min_length": 1807.0, "completions/min_terminated_length": 1807.0, "entropy": 0.1259512212127447, "epoch": 0.00439, "frac_reward_zero_std": 0.0, "grad_norm": 0.4923267066478729, "kl": 0.4903149511665106, "learning_rate": 1.324362001364597e-05, "loss": -0.0018, "num_tokens": 16157432.0, "reward": -0.2570919394493103, "reward_std": 0.25686028599739075, "rewards/rollout_reward_func/mean": -0.2570919394493103, "rewards/rollout_reward_func/std": 0.25481900572776794, "sampling/importance_sampling_ratio/max": 1.2473781108856201, "sampling/importance_sampling_ratio/mean": 0.9986450672149658, "sampling/importance_sampling_ratio/min": 0.7592068910598755, "sampling/sampling_logp_difference/max": 0.2754809856414795, "sampling/sampling_logp_difference/mean": 0.006806075107306242, "step": 439, "step_time": 28.437626820998048 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.0052926496136933565, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.00792452017776668, "entropy": 0.12483756244182587, "epoch": 0.0044, "grad_norm": 0.29948118329048157, "kl": 0.4569387435913086, "learning_rate": 1.3157161091016231e-05, "loss": -0.0034, "step": 440, "step_time": 6.6165893800025515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1908.03125, "completions/mean_terminated_length": 1908.03125, "completions/min_length": 1077.0, "completions/min_terminated_length": 1077.0, "entropy": 0.11330333724617958, "epoch": 0.00441, "frac_reward_zero_std": 0.0, "grad_norm": 0.5402713418006897, "kl": 0.656514760106802, "learning_rate": 1.307202263513262e-05, "loss": -0.0169, "num_tokens": 16244023.0, "reward": -0.2505173683166504, "reward_std": 0.2968015968799591, "rewards/rollout_reward_func/mean": -0.2505173683166504, "rewards/rollout_reward_func/std": 0.35491979122161865, "sampling/importance_sampling_ratio/max": 1.2446954250335693, "sampling/importance_sampling_ratio/mean": 1.00038480758667, "sampling/importance_sampling_ratio/min": 0.36310580372810364, "sampling/sampling_logp_difference/max": 1.0130610466003418, "sampling/sampling_logp_difference/mean": 0.00647688377648592, "step": 441, "step_time": 26.60117446699951 }, { "clip_ratio/high_max": 0.005437352228909731, "clip_ratio/high_mean": 0.0027186761144548655, "clip_ratio/low_mean": 0.005620059440843761, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008338735555298626, "entropy": 0.10917800106108189, "epoch": 0.00442, "grad_norm": 0.3215118944644928, "kl": 0.6582770608365536, "learning_rate": 1.2988208795523368e-05, "loss": -0.0185, "step": 442, "step_time": 7.39950187300019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2196.0, "completions/max_terminated_length": 2196.0, "completions/mean_length": 1936.78125, "completions/mean_terminated_length": 1936.78125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.12051359843462706, "epoch": 0.00443, "frac_reward_zero_std": 0.0, "grad_norm": 0.6043969988822937, "kl": 0.5161153972148895, "learning_rate": 1.2905723657156775e-05, "loss": -0.026, "num_tokens": 16331553.0, "reward": -0.35962921380996704, "reward_std": 0.24819818139076233, "rewards/rollout_reward_func/mean": -0.35962921380996704, "rewards/rollout_reward_func/std": 0.2570416033267975, "sampling/importance_sampling_ratio/max": 1.256624698638916, "sampling/importance_sampling_ratio/mean": 1.0012190341949463, "sampling/importance_sampling_ratio/min": 0.7734999060630798, "sampling/sampling_logp_difference/max": 0.25682973861694336, "sampling/sampling_logp_difference/mean": 0.006655033677816391, "step": 443, "step_time": 27.443395860003875 }, { "clip_ratio/high_max": 0.005321558099240065, "clip_ratio/high_mean": 0.0026607790496200323, "clip_ratio/low_mean": 0.0067461380967870355, "clip_ratio/low_min": 0.002659574383869767, "clip_ratio/region_mean": 0.009406917146407068, "entropy": 0.11694698687642813, "epoch": 0.00444, "grad_norm": 0.33141013979911804, "kl": 0.5222091823816299, "learning_rate": 1.2824571240242112e-05, "loss": -0.0285, "step": 444, "step_time": 7.247236041001088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2216.0, "completions/max_terminated_length": 2216.0, "completions/mean_length": 1986.75, "completions/mean_terminated_length": 1986.75, "completions/min_length": 1413.0, "completions/min_terminated_length": 1413.0, "entropy": 0.12127939239144325, "epoch": 0.00445, "frac_reward_zero_std": 0.0, "grad_norm": 0.5574556589126587, "kl": 0.3899236097931862, "learning_rate": 1.2744755500033685e-05, "loss": -0.0052, "num_tokens": 16420693.0, "reward": -0.38479283452033997, "reward_std": 0.2608441412448883, "rewards/rollout_reward_func/mean": -0.38479283452033997, "rewards/rollout_reward_func/std": 0.25972315669059753, "sampling/importance_sampling_ratio/max": 1.3037924766540527, "sampling/importance_sampling_ratio/mean": 0.9979789853096008, "sampling/importance_sampling_ratio/min": 0.5907281041145325, "sampling/sampling_logp_difference/max": 0.5263993740081787, "sampling/sampling_logp_difference/mean": 0.007131622638553381, "step": 445, "step_time": 28.747001276000447 }, { "clip_ratio/high_max": 0.013134058332070708, "clip_ratio/high_mean": 0.006567029166035354, "clip_ratio/low_mean": 0.005394345382228494, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.011961374548263848, "entropy": 0.11846609320491552, "epoch": 0.00446, "grad_norm": 0.42523178458213806, "kl": 0.38562365621328354, "learning_rate": 1.2666280326638078e-05, "loss": -0.0069, "step": 446, "step_time": 6.775098223999521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2158.0, "completions/max_terminated_length": 2158.0, "completions/mean_length": 1985.46875, "completions/mean_terminated_length": 1985.46875, "completions/min_length": 1855.0, "completions/min_terminated_length": 1855.0, "entropy": 0.10534087661653757, "epoch": 0.00447, "frac_reward_zero_std": 0.0, "grad_norm": 0.6470057368278503, "kl": 0.3893734812736511, "learning_rate": 1.2589149544824521e-05, "loss": 0.0011, "num_tokens": 16509797.0, "reward": -0.3199900984764099, "reward_std": 0.2851865887641907, "rewards/rollout_reward_func/mean": -0.3199900984764099, "rewards/rollout_reward_func/std": 0.28053978085517883, "sampling/importance_sampling_ratio/max": 1.21547532081604, "sampling/importance_sampling_ratio/mean": 0.9999539852142334, "sampling/importance_sampling_ratio/min": 0.7251470685005188, "sampling/sampling_logp_difference/max": 0.3213807940483093, "sampling/sampling_logp_difference/mean": 0.0062070610001683235, "step": 447, "step_time": 30.073813159000565 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 0.10271557793021202, "epoch": 0.00448, "grad_norm": 0.3989216089248657, "kl": 0.3854765221476555, "learning_rate": 1.251336691383851e-05, "loss": 0.0004, "step": 448, "step_time": 6.748306620000221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2222.0, "completions/max_terminated_length": 2222.0, "completions/mean_length": 2023.5, "completions/mean_terminated_length": 2023.5, "completions/min_length": 1867.0, "completions/min_terminated_length": 1867.0, "entropy": 0.10493952315300703, "epoch": 0.00449, "frac_reward_zero_std": 0.0, "grad_norm": 1.5771797895431519, "kl": 0.41039883717894554, "learning_rate": 1.2438936127218569e-05, "loss": 0.0022, "num_tokens": 16600171.0, "reward": -0.34410178661346436, "reward_std": 0.3472347557544708, "rewards/rollout_reward_func/mean": -0.34410178661346436, "rewards/rollout_reward_func/std": 0.3723931610584259, "sampling/importance_sampling_ratio/max": 2.7696821689605713, "sampling/importance_sampling_ratio/mean": 1.0035964250564575, "sampling/importance_sampling_ratio/min": 0.7858356833457947, "sampling/sampling_logp_difference/max": 1.0187325477600098, "sampling/sampling_logp_difference/mean": 0.00740304309874773, "step": 449, "step_time": 29.765667254001528 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.002659574383869767, "clip_ratio/low_mean": 0.002659574383869767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005319148767739534, "entropy": 0.10206926986575127, "epoch": 0.0045, "grad_norm": 0.42204853892326355, "kl": 0.44455499202013016, "learning_rate": 1.2365860812616227e-05, "loss": 0.0011, "step": 450, "step_time": 6.797776214003534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 1969.40625, "completions/mean_terminated_length": 1969.40625, "completions/min_length": 1832.0, "completions/min_terminated_length": 1832.0, "entropy": 0.09161322936415672, "epoch": 0.00451, "frac_reward_zero_std": 0.0, "grad_norm": 0.6289034485816956, "kl": 0.3792526666074991, "learning_rate": 1.2294144531619225e-05, "loss": 0.005, "num_tokens": 16688706.0, "reward": -0.3749367892742157, "reward_std": 0.20671427249908447, "rewards/rollout_reward_func/mean": -0.3749367892742157, "rewards/rollout_reward_func/std": 0.207978293299675, "sampling/importance_sampling_ratio/max": 1.1772856712341309, "sampling/importance_sampling_ratio/mean": 0.9990208148956299, "sampling/importance_sampling_ratio/min": 0.601029098033905, "sampling/sampling_logp_difference/max": 0.5091118812561035, "sampling/sampling_logp_difference/mean": 0.006169471424072981, "step": 451, "step_time": 28.043286738999086 }, { "clip_ratio/high_max": 0.00786790787242353, "clip_ratio/high_mean": 0.003933953936211765, "clip_ratio/low_mean": 0.003933953936211765, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00786790787242353, "entropy": 0.09180748090147972, "epoch": 0.00452, "grad_norm": 0.48425933718681335, "kl": 0.38290321454405785, "learning_rate": 1.2223790779577922e-05, "loss": 0.0031, "step": 452, "step_time": 7.389338656003019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2268.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 1976.6875, "completions/mean_terminated_length": 1976.6875, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "entropy": 0.09296826645731926, "epoch": 0.00453, "frac_reward_zero_std": 0.0, "grad_norm": 0.492795467376709, "kl": 0.4385579898953438, "learning_rate": 1.2154802985434939e-05, "loss": -0.0132, "num_tokens": 16777561.0, "reward": -0.3443661332130432, "reward_std": 0.28891798853874207, "rewards/rollout_reward_func/mean": -0.3443661332130432, "rewards/rollout_reward_func/std": 0.29089412093162537, "sampling/importance_sampling_ratio/max": 1.3902348279953003, "sampling/importance_sampling_ratio/mean": 0.9987429976463318, "sampling/importance_sampling_ratio/min": 0.4543963074684143, "sampling/sampling_logp_difference/max": 0.7887855768203735, "sampling/sampling_logp_difference/mean": 0.010194317437708378, "step": 453, "step_time": 28.144397118001507 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.007812500232830644, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.09342990489676595, "epoch": 0.00454, "grad_norm": 0.31741201877593994, "kl": 0.42704974859952927, "learning_rate": 1.2087184511558035e-05, "loss": -0.0152, "step": 454, "step_time": 7.399868163001884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2208.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 2008.40625, "completions/mean_terminated_length": 2008.40625, "completions/min_length": 1780.0, "completions/min_terminated_length": 1780.0, "entropy": 0.10969215352088213, "epoch": 0.00455, "frac_reward_zero_std": 0.0, "grad_norm": 0.476182222366333, "kl": 0.29175813868641853, "learning_rate": 1.2020938653576227e-05, "loss": 0.002, "num_tokens": 16867391.0, "reward": -0.3392788767814636, "reward_std": 0.2111305594444275, "rewards/rollout_reward_func/mean": -0.3392788767814636, "rewards/rollout_reward_func/std": 0.21159015595912933, "sampling/importance_sampling_ratio/max": 1.3500409126281738, "sampling/importance_sampling_ratio/mean": 1.0004421472549438, "sampling/importance_sampling_ratio/min": 0.8102636933326721, "sampling/sampling_logp_difference/max": 0.30013489723205566, "sampling/sampling_logp_difference/mean": 0.007756904698908329, "step": 455, "step_time": 28.48237173400048 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003906250116415322, "entropy": 0.10989444050937891, "epoch": 0.00456, "grad_norm": 0.4381773769855499, "kl": 0.28894236497581005, "learning_rate": 1.1956068640219182e-05, "loss": 0.0009, "step": 456, "step_time": 7.335507965000943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 1857.9375, "completions/mean_terminated_length": 1857.9375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.07234261929988861, "epoch": 0.00457, "frac_reward_zero_std": 0.0, "grad_norm": 0.2912171185016632, "kl": 0.26219843700528145, "learning_rate": 1.1892577633159827e-05, "loss": -0.0233, "num_tokens": 16952365.0, "reward": -0.22893117368221283, "reward_std": 0.3080277144908905, "rewards/rollout_reward_func/mean": -0.22893117368221283, "rewards/rollout_reward_func/std": 0.33692359924316406, "sampling/importance_sampling_ratio/max": 1.3593683242797852, "sampling/importance_sampling_ratio/mean": 1.0006065368652344, "sampling/importance_sampling_ratio/min": 0.536118745803833, "sampling/sampling_logp_difference/max": 0.6233996152877808, "sampling/sampling_logp_difference/mean": 0.0052309101447463036, "step": 457, "step_time": 28.274452127996483 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0026041667442768812, "clip_ratio/region_mean": 0.003906250116415322, "entropy": 0.07114090071991086, "epoch": 0.00458, "grad_norm": 0.22449766099452972, "kl": 0.2599310390651226, "learning_rate": 1.1830468726860275e-05, "loss": -0.0241, "step": 458, "step_time": 6.675185223999506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 1956.28125, "completions/mean_terminated_length": 1956.28125, "completions/min_length": 1212.0, "completions/min_terminated_length": 1212.0, "entropy": 0.07327756658196449, "epoch": 0.00459, "frac_reward_zero_std": 0.0, "grad_norm": 0.26922935247421265, "kl": 0.2865948434919119, "learning_rate": 1.1769744948420998e-05, "loss": 0.0003, "num_tokens": 17040548.0, "reward": -0.2646375894546509, "reward_std": 0.25270581245422363, "rewards/rollout_reward_func/mean": -0.2646375894546509, "rewards/rollout_reward_func/std": 0.25496357679367065, "sampling/importance_sampling_ratio/max": 1.4113478660583496, "sampling/importance_sampling_ratio/mean": 0.9987874031066895, "sampling/importance_sampling_ratio/min": 0.502673864364624, "sampling/sampling_logp_difference/max": 0.6878137588500977, "sampling/sampling_logp_difference/mean": 0.004707749001681805, "step": 459, "step_time": 28.83551998599978 }, { "clip_ratio/high_max": 0.002659574383869767, "clip_ratio/high_mean": 0.0013297871919348836, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013297871919348836, "entropy": 0.07254242058843374, "epoch": 0.0046, "grad_norm": 0.2555963099002838, "kl": 0.2881425116211176, "learning_rate": 1.1710409257433288e-05, "loss": 0.0003, "step": 460, "step_time": 6.678004288998636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2207.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 1931.0625, "completions/mean_terminated_length": 1931.0625, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "entropy": 0.09740176424384117, "epoch": 0.00461, "frac_reward_zero_std": 0.0, "grad_norm": 0.4644820988178253, "kl": 0.4026929475367069, "learning_rate": 1.1652464545835011e-05, "loss": -0.0131, "num_tokens": 17127912.0, "reward": -0.37963831424713135, "reward_std": 0.2695067524909973, "rewards/rollout_reward_func/mean": -0.37963831424713135, "rewards/rollout_reward_func/std": 0.2825929522514343, "sampling/importance_sampling_ratio/max": 1.3554491996765137, "sampling/importance_sampling_ratio/mean": 1.001615285873413, "sampling/importance_sampling_ratio/min": 0.7628933787345886, "sampling/sampling_logp_difference/max": 0.30413293838500977, "sampling/sampling_logp_difference/mean": 0.005544065497815609, "step": 461, "step_time": 29.940966695001407 }, { "clip_ratio/high_max": 0.005319148767739534, "clip_ratio/high_mean": 0.003989361692219973, "clip_ratio/low_mean": 0.0013297871919348836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005319148884154856, "entropy": 0.09503230825066566, "epoch": 0.00462, "grad_norm": 0.3174663782119751, "kl": 0.42478554137051105, "learning_rate": 1.159591363776967e-05, "loss": -0.0152, "step": 462, "step_time": 6.711516403001951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2023.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1893.46875, "completions/mean_terminated_length": 1893.46875, "completions/min_length": 1747.0, "completions/min_terminated_length": 1747.0, "entropy": 0.072243916336447, "epoch": 0.00463, "frac_reward_zero_std": 0.0, "grad_norm": 0.5355452299118042, "kl": 0.35659876838326454, "learning_rate": 1.1540759289448735e-05, "loss": 0.0029, "num_tokens": 17213967.0, "reward": -0.3611953556537628, "reward_std": 0.2977216839790344, "rewards/rollout_reward_func/mean": -0.3611953556537628, "rewards/rollout_reward_func/std": 0.29593896865844727, "sampling/importance_sampling_ratio/max": 1.2572087049484253, "sampling/importance_sampling_ratio/mean": 0.998875617980957, "sampling/importance_sampling_ratio/min": 0.6621916890144348, "sampling/sampling_logp_difference/max": 0.4122002124786377, "sampling/sampling_logp_difference/mean": 0.00560368038713932, "step": 463, "step_time": 28.89601301699986 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.003933954052627087, "clip_ratio/low_min": 0.002659574383869767, "clip_ratio/region_mean": 0.005236037424765527, "entropy": 0.07002271106466651, "epoch": 0.00464, "grad_norm": 0.22637081146240234, "kl": 0.3561781123280525, "learning_rate": 1.1487004189017333e-05, "loss": 0.0013, "step": 464, "step_time": 6.876917653004057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1903.84375, "completions/mean_terminated_length": 1903.84375, "completions/min_length": 1373.0, "completions/min_terminated_length": 1373.0, "entropy": 0.06557762064039707, "epoch": 0.00465, "frac_reward_zero_std": 0.0, "grad_norm": 0.6694150567054749, "kl": 0.36426056921482086, "learning_rate": 1.1434650956423223e-05, "loss": 0.0058, "num_tokens": 17300371.0, "reward": -0.22558437287807465, "reward_std": 0.35350215435028076, "rewards/rollout_reward_func/mean": -0.22558437287807465, "rewards/rollout_reward_func/std": 0.42270010709762573, "sampling/importance_sampling_ratio/max": 1.2563961744308472, "sampling/importance_sampling_ratio/mean": 1.0000178813934326, "sampling/importance_sampling_ratio/min": 0.7392328381538391, "sampling/sampling_logp_difference/max": 0.30214229226112366, "sampling/sampling_logp_difference/mean": 0.00419003376737237, "step": 465, "step_time": 29.057112568001685 }, { "clip_ratio/high_max": 0.00786790787242353, "clip_ratio/high_mean": 0.003933953936211765, "clip_ratio/low_mean": 0.006567029166035354, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010500983102247119, "entropy": 0.06391376443207264, "epoch": 0.00466, "grad_norm": 0.33803799748420715, "kl": 0.3622515555471182, "learning_rate": 1.1383702143289104e-05, "loss": 0.0031, "step": 466, "step_time": 7.496383542003969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2242.0, "completions/max_terminated_length": 2242.0, "completions/mean_length": 1941.53125, "completions/mean_terminated_length": 1941.53125, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "entropy": 0.08016159012913704, "epoch": 0.00467, "frac_reward_zero_std": 0.0, "grad_norm": 0.8689066767692566, "kl": 1.0717766247689724, "learning_rate": 1.1334160232788248e-05, "loss": -0.0032, "num_tokens": 17388103.0, "reward": -0.2896825969219208, "reward_std": 0.2694796025753021, "rewards/rollout_reward_func/mean": -0.2896825969219208, "rewards/rollout_reward_func/std": 0.286941260099411, "sampling/importance_sampling_ratio/max": 1.4595797061920166, "sampling/importance_sampling_ratio/mean": 0.9992034435272217, "sampling/importance_sampling_ratio/min": 0.4930723309516907, "sampling/sampling_logp_difference/max": 0.707099437713623, "sampling/sampling_logp_difference/mean": 0.007459389977157116, "step": 467, "step_time": 27.404616480000186 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003933953936211765, "entropy": 0.07852241187356412, "epoch": 0.00468, "grad_norm": 0.5788558125495911, "kl": 0.8407633155584335, "learning_rate": 1.128602763952348e-05, "loss": -0.0065, "step": 468, "step_time": 6.870010152000759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2125.0, "completions/max_terminated_length": 2125.0, "completions/mean_length": 1948.46875, "completions/mean_terminated_length": 1948.46875, "completions/min_length": 1629.0, "completions/min_terminated_length": 1629.0, "entropy": 0.07986621651798487, "epoch": 0.00469, "frac_reward_zero_std": 0.0, "grad_norm": 0.7844597101211548, "kl": 0.3903021663427353, "learning_rate": 1.1239306709409491e-05, "loss": -0.0043, "num_tokens": 17476008.0, "reward": -0.30433809757232666, "reward_std": 0.34596896171569824, "rewards/rollout_reward_func/mean": -0.30433809757232666, "rewards/rollout_reward_func/std": 0.38398435711860657, "sampling/importance_sampling_ratio/max": 1.3512990474700928, "sampling/importance_sampling_ratio/mean": 0.9998942613601685, "sampling/importance_sampling_ratio/min": 0.4108227491378784, "sampling/sampling_logp_difference/max": 0.8895934820175171, "sampling/sampling_logp_difference/mean": 0.007383827120065689, "step": 469, "step_time": 27.71274685500248 }, { "clip_ratio/high_max": 0.013358098221942782, "clip_ratio/high_mean": 0.006679049110971391, "clip_ratio/low_mean": 0.002631870564073324, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009310919675044715, "entropy": 0.07851193565875292, "epoch": 0.0047, "grad_norm": 0.3075670301914215, "kl": 0.38931426778435707, "learning_rate": 1.1193999719558504e-05, "loss": -0.0054, "step": 470, "step_time": 7.342923314998188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 1977.75, "completions/mean_terminated_length": 1977.75, "completions/min_length": 1791.0, "completions/min_terminated_length": 1791.0, "entropy": 0.06952641764655709, "epoch": 0.00471, "frac_reward_zero_std": 0.0, "grad_norm": 0.6098089814186096, "kl": 0.3800669461488724, "learning_rate": 1.115010887816928e-05, "loss": -0.0019, "num_tokens": 17564835.0, "reward": -0.19059479236602783, "reward_std": 0.4257424473762512, "rewards/rollout_reward_func/mean": -0.19059479236602783, "rewards/rollout_reward_func/std": 0.4242932200431824, "sampling/importance_sampling_ratio/max": 1.139782190322876, "sampling/importance_sampling_ratio/mean": 0.9996844530105591, "sampling/importance_sampling_ratio/min": 0.6485833525657654, "sampling/sampling_logp_difference/max": 0.4329648017883301, "sampling/sampling_logp_difference/mean": 0.0047391075640916824, "step": 471, "step_time": 27.68122412700177 }, { "clip_ratio/high_max": 0.00786790787242353, "clip_ratio/high_mean": 0.003933953936211765, "clip_ratio/low_mean": 0.003906250116415322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007840204052627087, "entropy": 0.06872328231111169, "epoch": 0.00472, "grad_norm": 0.33254748582839966, "kl": 0.37852997332811356, "learning_rate": 1.1107636324419513e-05, "loss": -0.0032, "step": 472, "step_time": 6.716563785003018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 2014.59375, "completions/mean_terminated_length": 2014.59375, "completions/min_length": 1821.0, "completions/min_terminated_length": 1821.0, "entropy": 0.07025853218510747, "epoch": 0.00473, "frac_reward_zero_std": 0.0, "grad_norm": 0.5066038966178894, "kl": 0.2954424377530813, "learning_rate": 1.1066584128361551e-05, "loss": -0.002, "num_tokens": 17654920.0, "reward": -0.3004554510116577, "reward_std": 0.20028242468833923, "rewards/rollout_reward_func/mean": -0.3004554510116577, "rewards/rollout_reward_func/std": 0.25992223620414734, "sampling/importance_sampling_ratio/max": 1.2451509237289429, "sampling/importance_sampling_ratio/mean": 0.9981347918510437, "sampling/importance_sampling_ratio/min": 0.2748691737651825, "sampling/sampling_logp_difference/max": 1.2914600372314453, "sampling/sampling_logp_difference/mean": 0.00820106826722622, "step": 473, "step_time": 27.51574054899902 }, { "clip_ratio/high_max": 0.010527482256293297, "clip_ratio/high_mean": 0.005263741128146648, "clip_ratio/low_mean": 0.003933953936211765, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009197695064358413, "entropy": 0.06982908584177494, "epoch": 0.00474, "grad_norm": 0.3839394152164459, "kl": 0.2925265561789274, "learning_rate": 1.1026954290821512e-05, "loss": -0.0035, "step": 474, "step_time": 7.172233242998118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 2009.65625, "completions/mean_terminated_length": 2009.65625, "completions/min_length": 1880.0, "completions/min_terminated_length": 1880.0, "entropy": 0.08056351263076067, "epoch": 0.00475, "frac_reward_zero_std": 0.0, "grad_norm": 0.825448215007782, "kl": 0.9064167514443398, "learning_rate": 1.098874874330177e-05, "loss": 0.0039, "num_tokens": 17744826.0, "reward": -0.3195793032646179, "reward_std": 0.16823403537273407, "rewards/rollout_reward_func/mean": -0.3195793032646179, "rewards/rollout_reward_func/std": 0.16860099136829376, "sampling/importance_sampling_ratio/max": 1.410538673400879, "sampling/importance_sampling_ratio/mean": 0.9992801547050476, "sampling/importance_sampling_ratio/min": 0.6550285220146179, "sampling/sampling_logp_difference/max": 0.4230765104293823, "sampling/sampling_logp_difference/mean": 0.005919348448514938, "step": 475, "step_time": 29.317023166999206 }, { "clip_ratio/high_max": 0.005263741128146648, "clip_ratio/high_mean": 0.002631870564073324, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005236037308350205, "entropy": 0.08109111059457064, "epoch": 0.00476, "grad_norm": 0.6433701515197754, "kl": 0.7871890068054199, "learning_rate": 1.095196934788681e-05, "loss": 0.002, "step": 476, "step_time": 6.694667873000071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 2011.03125, "completions/mean_terminated_length": 2011.03125, "completions/min_length": 1775.0, "completions/min_terminated_length": 1775.0, "entropy": 0.07260708510875702, "epoch": 0.00477, "frac_reward_zero_std": 0.0, "grad_norm": 0.5816273093223572, "kl": 0.37016328424215317, "learning_rate": 1.0916617897152478e-05, "loss": 0.0006, "num_tokens": 17834780.0, "reward": -0.25405603647232056, "reward_std": 0.2481149435043335, "rewards/rollout_reward_func/mean": -0.25405603647232056, "rewards/rollout_reward_func/std": 0.25024524331092834, "sampling/importance_sampling_ratio/max": 1.3614604473114014, "sampling/importance_sampling_ratio/mean": 0.9983688592910767, "sampling/importance_sampling_ratio/min": 0.5238931179046631, "sampling/sampling_logp_difference/max": 0.6464675068855286, "sampling/sampling_logp_difference/mean": 0.006948560010641813, "step": 477, "step_time": 28.399781508998785 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.07190942065790296, "epoch": 0.00478, "grad_norm": 0.41811370849609375, "kl": 0.36808030121028423, "learning_rate": 1.0882696114078608e-05, "loss": 0.0013, "step": 478, "step_time": 6.749917467999694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 1978.625, "completions/mean_terminated_length": 1978.625, "completions/min_length": 1866.0, "completions/min_terminated_length": 1866.0, "entropy": 0.06799233751371503, "epoch": 0.00479, "frac_reward_zero_std": 0.0, "grad_norm": 0.6093500852584839, "kl": 0.4936750531196594, "learning_rate": 1.0850205651965054e-05, "loss": 0.0044, "num_tokens": 17923671.0, "reward": -0.34733691811561584, "reward_std": 0.2586010694503784, "rewards/rollout_reward_func/mean": -0.34733691811561584, "rewards/rollout_reward_func/std": 0.2629219591617584, "sampling/importance_sampling_ratio/max": 1.2096679210662842, "sampling/importance_sampling_ratio/mean": 0.9984405040740967, "sampling/importance_sampling_ratio/min": 0.43276605010032654, "sampling/sampling_logp_difference/max": 0.8375580310821533, "sampling/sampling_logp_difference/mean": 0.006006847135722637, "step": 479, "step_time": 29.36835067800166 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0053203534334897995, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00792452017776668, "entropy": 0.06670363014563918, "epoch": 0.0048, "grad_norm": 0.36310523748397827, "kl": 0.4838133305311203, "learning_rate": 1.0819148094351096e-05, "loss": 0.0027, "step": 480, "step_time": 6.691241860999071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 2279.65625, "completions/mean_terminated_length": 2279.65625, "completions/min_length": 1742.0, "completions/min_terminated_length": 1742.0, "entropy": 0.07509864261373878, "epoch": 0.00481, "frac_reward_zero_std": 0.0, "grad_norm": 0.43689459562301636, "kl": 0.40509631484746933, "learning_rate": 1.0789524954938282e-05, "loss": 0.0059, "num_tokens": 18022158.0, "reward": -0.20613950490951538, "reward_std": 0.28310006856918335, "rewards/rollout_reward_func/mean": -0.20613950490951538, "rewards/rollout_reward_func/std": 0.31597083806991577, "sampling/importance_sampling_ratio/max": 1.937630534172058, "sampling/importance_sampling_ratio/mean": 1.0006837844848633, "sampling/importance_sampling_ratio/min": 0.7006114721298218, "sampling/sampling_logp_difference/max": 0.6614658832550049, "sampling/sampling_logp_difference/mean": 0.00641980255022645, "step": 481, "step_time": 31.873006665002322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001179245300590992, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001179245300590992, "entropy": 0.07509506680071354, "epoch": 0.00482, "grad_norm": 0.4134581685066223, "kl": 0.4124436844140291, "learning_rate": 1.0761337677516632e-05, "loss": 0.0046, "step": 482, "step_time": 7.294292765996943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 2316.0625, "completions/mean_terminated_length": 2316.0625, "completions/min_length": 2123.0, "completions/min_terminated_length": 2123.0, "entropy": 0.06688045803457499, "epoch": 0.00483, "frac_reward_zero_std": 0.0, "grad_norm": 0.6008337140083313, "kl": 0.29455845803022385, "learning_rate": 1.0734587635894285e-05, "loss": -0.004, "num_tokens": 18121835.0, "reward": -0.3050263524055481, "reward_std": 0.2641700208187103, "rewards/rollout_reward_func/mean": -0.3050263524055481, "rewards/rollout_reward_func/std": 0.28118613362312317, "sampling/importance_sampling_ratio/max": 1.937331199645996, "sampling/importance_sampling_ratio/mean": 1.0002374649047852, "sampling/importance_sampling_ratio/min": 0.5394382476806641, "sampling/sampling_logp_difference/max": 0.6613113880157471, "sampling/sampling_logp_difference/mean": 0.00788648147135973, "step": 483, "step_time": 32.40096225599882 }, { "clip_ratio/high_max": 0.009262435603886843, "clip_ratio/high_mean": 0.004631217801943421, "clip_ratio/low_mean": 0.005855119903571904, "clip_ratio/low_min": 0.002314814832061529, "clip_ratio/region_mean": 0.010486337705515325, "entropy": 0.06589845474809408, "epoch": 0.00484, "grad_norm": 0.5266754031181335, "kl": 0.2963740676641464, "learning_rate": 1.0709276133830523e-05, "loss": -0.0055, "step": 484, "step_time": 8.118802454002434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2543.0, "completions/max_terminated_length": 2543.0, "completions/mean_length": 2299.375, "completions/mean_terminated_length": 2299.375, "completions/min_length": 2137.0, "completions/min_terminated_length": 2137.0, "entropy": 0.06779825687408447, "epoch": 0.00485, "frac_reward_zero_std": 0.0, "grad_norm": 0.5387915968894958, "kl": 0.3514415882527828, "learning_rate": 1.0685404404972256e-05, "loss": 0.002, "num_tokens": 18220960.0, "reward": -0.23470231890678406, "reward_std": 0.20318783819675446, "rewards/rollout_reward_func/mean": -0.23470231890678406, "rewards/rollout_reward_func/std": 0.19751466810703278, "sampling/importance_sampling_ratio/max": 1.1726657152175903, "sampling/importance_sampling_ratio/mean": 0.9985332489013672, "sampling/importance_sampling_ratio/min": 0.33416879177093506, "sampling/sampling_logp_difference/max": 1.0961090326309204, "sampling/sampling_logp_difference/mean": 0.0065994830802083015, "step": 485, "step_time": 31.741606798999783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009325612802058458, "clip_ratio/low_min": 0.002358490601181984, "clip_ratio/region_mean": 0.009325612802058458, "entropy": 0.06671161064878106, "epoch": 0.00486, "grad_norm": 0.30565258860588074, "kl": 0.3549097888171673, "learning_rate": 1.0662973612793867e-05, "loss": 0.0, "step": 486, "step_time": 7.53108921099556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2404.0, "completions/max_terminated_length": 2404.0, "completions/mean_length": 2237.5625, "completions/mean_terminated_length": 2237.5625, "completions/min_length": 1683.0, "completions/min_terminated_length": 1683.0, "entropy": 0.05996994022279978, "epoch": 0.00487, "frac_reward_zero_std": 0.0, "grad_norm": 0.755391001701355, "kl": 0.33826661854982376, "learning_rate": 1.0641984850540521e-05, "loss": -0.0055, "num_tokens": 18318047.0, "reward": -0.21868684887886047, "reward_std": 0.29433852434158325, "rewards/rollout_reward_func/mean": -0.21868684887886047, "rewards/rollout_reward_func/std": 0.29288750886917114, "sampling/importance_sampling_ratio/max": 3.0, "sampling/importance_sampling_ratio/mean": 1.0016038417816162, "sampling/importance_sampling_ratio/min": 0.6618261933326721, "sampling/sampling_logp_difference/max": 1.3770461082458496, "sampling/sampling_logp_difference/mean": 0.005781671963632107, "step": 487, "step_time": 31.780545092000466 }, { "clip_ratio/high_max": 0.00733287981711328, "clip_ratio/high_mean": 0.00366643990855664, "clip_ratio/low_mean": 0.0023366527166217566, "clip_ratio/low_min": 0.002314814832061529, "clip_ratio/region_mean": 0.006003092625178397, "entropy": 0.059521961491554976, "epoch": 0.00488, "grad_norm": 0.3168330788612366, "kl": 0.33975275233387947, "learning_rate": 1.0622439141174882e-05, "loss": -0.0069, "step": 488, "step_time": 7.748022366999066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 2165.25, "completions/mean_terminated_length": 2165.25, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.06874157302081585, "epoch": 0.00489, "frac_reward_zero_std": 0.0, "grad_norm": 0.6277678608894348, "kl": 0.3725462891161442, "learning_rate": 1.0604337437327243e-05, "loss": -0.0483, "num_tokens": 18412883.0, "reward": -0.31094127893447876, "reward_std": 0.267703115940094, "rewards/rollout_reward_func/mean": -0.31094127893447876, "rewards/rollout_reward_func/std": 0.28639477491378784, "sampling/importance_sampling_ratio/max": 1.2746199369430542, "sampling/importance_sampling_ratio/mean": 0.9997426867485046, "sampling/importance_sampling_ratio/min": 0.6180114150047302, "sampling/sampling_logp_difference/max": 0.4812483787536621, "sampling/sampling_logp_difference/mean": 0.006199668161571026, "step": 489, "step_time": 32.376150318996224 }, { "clip_ratio/high_max": 0.002314814832061529, "clip_ratio/high_mean": 0.0011574074160307646, "clip_ratio/low_mean": 0.0011574074160307646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002314814832061529, "entropy": 0.06940048886463046, "epoch": 0.0049, "grad_norm": 0.5937971472740173, "kl": 0.3717838767915964, "learning_rate": 1.0587680621249123e-05, "loss": -0.0484, "step": 490, "step_time": 7.362742343997525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2554.0, "completions/max_terminated_length": 2554.0, "completions/mean_length": 2309.59375, "completions/mean_terminated_length": 2309.59375, "completions/min_length": 2087.0, "completions/min_terminated_length": 2087.0, "entropy": 0.06788519537076354, "epoch": 0.00491, "frac_reward_zero_std": 0.0, "grad_norm": 0.6532019972801208, "kl": 0.37320613116025925, "learning_rate": 1.057246950477023e-05, "loss": 0.0071, "num_tokens": 18512343.0, "reward": -0.27413326501846313, "reward_std": 0.22158397734165192, "rewards/rollout_reward_func/mean": -0.27413326501846313, "rewards/rollout_reward_func/std": 0.2453901618719101, "sampling/importance_sampling_ratio/max": 1.256028652191162, "sampling/importance_sampling_ratio/mean": 1.0006215572357178, "sampling/importance_sampling_ratio/min": 0.40615034103393555, "sampling/sampling_logp_difference/max": 0.9010319709777832, "sampling/sampling_logp_difference/mean": 0.00668709259480238, "step": 491, "step_time": 33.10035168000286 }, { "clip_ratio/high_max": 0.007031796034425497, "clip_ratio/high_mean": 0.0035158980172127485, "clip_ratio/low_mean": 0.005830712849274278, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009346610866487026, "entropy": 0.06849220162257552, "epoch": 0.00492, "grad_norm": 0.45474696159362793, "kl": 0.3639277648180723, "learning_rate": 1.055870482925892e-05, "loss": 0.0042, "step": 492, "step_time": 7.490408559997377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 2315.875, "completions/mean_terminated_length": 2315.875, "completions/min_length": 2147.0, "completions/min_terminated_length": 2147.0, "entropy": 0.07049377402290702, "epoch": 0.00493, "frac_reward_zero_std": 0.0, "grad_norm": 0.6827453374862671, "kl": 0.5675771497189999, "learning_rate": 1.0546387265586063e-05, "loss": 0.0058, "num_tokens": 18611992.0, "reward": -0.2381698191165924, "reward_std": 0.24713721871376038, "rewards/rollout_reward_func/mean": -0.2381698191165924, "rewards/rollout_reward_func/std": 0.2714662253856659, "sampling/importance_sampling_ratio/max": 1.2416174411773682, "sampling/importance_sampling_ratio/mean": 1.0006303787231445, "sampling/importance_sampling_ratio/min": 0.7806032299995422, "sampling/sampling_logp_difference/max": 0.24768829345703125, "sampling/sampling_logp_difference/mean": 0.005715197417885065, "step": 493, "step_time": 32.83830881300128 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.004629629664123058, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008101851912215352, "entropy": 0.0715000731870532, "epoch": 0.00494, "grad_norm": 0.5431090593338013, "kl": 0.5466551780700684, "learning_rate": 1.0535517414092337e-05, "loss": 0.003, "step": 494, "step_time": 8.099107970998375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 2121.28125, "completions/mean_terminated_length": 2121.28125, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "entropy": 0.05875223409384489, "epoch": 0.00495, "frac_reward_zero_std": 0.0, "grad_norm": 0.648433268070221, "kl": 0.3230237290263176, "learning_rate": 1.0526095804558962e-05, "loss": -0.0601, "num_tokens": 18705374.0, "reward": -0.3294994831085205, "reward_std": 0.2809543311595917, "rewards/rollout_reward_func/mean": -0.3294994831085205, "rewards/rollout_reward_func/std": 0.31239011883735657, "sampling/importance_sampling_ratio/max": 1.4172496795654297, "sampling/importance_sampling_ratio/mean": 0.9999298453330994, "sampling/importance_sampling_ratio/min": 0.6750344634056091, "sampling/sampling_logp_difference/max": 0.3929915428161621, "sampling/sampling_logp_difference/mean": 0.005393929313868284, "step": 495, "step_time": 29.835909386998537 }, { "clip_ratio/high_max": 0.002358490601181984, "clip_ratio/high_mean": 0.001179245300590992, "clip_ratio/low_mean": 0.002358490601181984, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003537735901772976, "entropy": 0.060209551360458136, "epoch": 0.00496, "grad_norm": 0.5795237421989441, "kl": 0.3232000954449177, "learning_rate": 1.0518122896181899e-05, "loss": -0.0597, "step": 496, "step_time": 7.082710956001392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2380.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 2189.4375, "completions/mean_terminated_length": 2189.4375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.07345778588205576, "epoch": 0.00497, "frac_reward_zero_std": 0.0, "grad_norm": 0.5517545938491821, "kl": 0.39771996811032295, "learning_rate": 1.0511599077549456e-05, "loss": -0.0476, "num_tokens": 18800961.0, "reward": -0.28739064931869507, "reward_std": 0.26829561591148376, "rewards/rollout_reward_func/mean": -0.28739064931869507, "rewards/rollout_reward_func/std": 0.28000959753990173, "sampling/importance_sampling_ratio/max": 1.422694444656372, "sampling/importance_sampling_ratio/mean": 0.9996083378791809, "sampling/importance_sampling_ratio/min": 0.5926069617271423, "sampling/sampling_logp_difference/max": 0.5232239365577698, "sampling/sampling_logp_difference/mean": 0.00792843010276556, "step": 497, "step_time": 30.5980185899989 }, { "clip_ratio/high_max": 0.01401991629973054, "clip_ratio/high_mean": 0.00700995814986527, "clip_ratio/low_mean": 0.003494060132652521, "clip_ratio/low_min": 0.002314814832061529, "clip_ratio/region_mean": 0.01050401828251779, "entropy": 0.07358254073187709, "epoch": 0.00498, "grad_norm": 0.42298614978790283, "kl": 0.38711633905768394, "learning_rate": 1.0506524666623345e-05, "loss": -0.0486, "step": 498, "step_time": 8.338743867996527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2509.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 2321.28125, "completions/mean_terminated_length": 2321.28125, "completions/min_length": 2127.0, "completions/min_terminated_length": 2127.0, "entropy": 0.06716568442061543, "epoch": 0.00499, "frac_reward_zero_std": 0.0, "grad_norm": 0.9311619997024536, "kl": 0.33122266083955765, "learning_rate": 1.0502899910723197e-05, "loss": 0.0008, "num_tokens": 18900785.0, "reward": -0.25196215510368347, "reward_std": 0.19783714413642883, "rewards/rollout_reward_func/mean": -0.25196215510368347, "rewards/rollout_reward_func/std": 0.2177085131406784, "sampling/importance_sampling_ratio/max": 1.328439474105835, "sampling/importance_sampling_ratio/mean": 0.9987441897392273, "sampling/importance_sampling_ratio/min": 0.6752278208732605, "sampling/sampling_logp_difference/max": 0.39270520210266113, "sampling/sampling_logp_difference/mean": 0.006362831220030785, "step": 499, "step_time": 32.47582928200245 }, { "clip_ratio/high_max": 0.009302935097366571, "clip_ratio/high_mean": 0.005853390553966165, "clip_ratio/low_mean": 0.002358490601181984, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008211881155148149, "entropy": 0.06805370608344674, "epoch": 0.005, "grad_norm": 0.5345240831375122, "kl": 0.3248710446059704, "learning_rate": 1.0500724986514505e-05, "loss": -0.0022, "step": 500, "step_time": 7.4551438560010865 } ], "logging_steps": 1.0, "max_steps": 500, "num_input_tokens_seen": 18900785, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }