{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03181066293421555, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0010895023006014526, "clip_ratio/high_mean": 0.00019086438669546624, "clip_ratio/low_mean": 0.00020682324338849867, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039768761962477583, "completions/clipped_ratio": 0.5892857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 1388.3482666015625, "completions/mean_terminated_length": 441.8913269042969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4731306731700897, "epoch": 0.0001272426517368622, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2442060261964798, "kl": 0.00014945304610591847, "learning_rate": 1e-05, "loss": -0.0653, "num_tokens": 183967.0, "reward": 0.29360121488571167, "reward_std": 0.3095513880252838, "rewards/unified_reward/mean": 0.29360121488571167, "rewards/unified_reward/std": 0.3095513582229614, "sampling/importance_sampling_ratio/max": 2.9140872955322266, "sampling/importance_sampling_ratio/mean": 0.5773221254348755, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.561802864074707, "sampling/sampling_logp_difference/mean": 0.01908954605460167, "step": 1, "step_time": 43.31057964311913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5178571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1314.169677734375, "completions/mean_terminated_length": 525.9815063476562, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 1.6711605787277222, "epoch": 0.0002544853034737244, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.19904133677482605, "kl": 0.0010994108451995999, "learning_rate": 9.998727573482632e-06, "loss": 0.0054, "num_tokens": 351754.0, "reward": 0.23794643580913544, "reward_std": 0.3876032829284668, "rewards/unified_reward/mean": 0.23794642090797424, "rewards/unified_reward/std": 0.3876032829284668, "sampling/importance_sampling_ratio/max": 2.7656195163726807, "sampling/importance_sampling_ratio/mean": 0.6358256936073303, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8684720993041992, "sampling/sampling_logp_difference/mean": 0.022221259772777557, "step": 2, "step_time": 37.79395275609568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.681559741788078e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.681559741788078e-06, "completions/clipped_ratio": 0.5892857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1624.0, "completions/mean_length": 1422.696533203125, "completions/mean_terminated_length": 525.521728515625, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 1.526512235403061, "epoch": 0.00038172795521058657, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.19431056082248688, "kl": 0.0020864024409092963, "learning_rate": 9.997455146965264e-06, "loss": -0.0646, "num_tokens": 551232.0, "reward": 0.26726192235946655, "reward_std": 0.353963166475296, "rewards/unified_reward/mean": 0.26726192235946655, "rewards/unified_reward/std": 0.353963166475296, "sampling/importance_sampling_ratio/max": 2.9633240699768066, "sampling/importance_sampling_ratio/mean": 0.5286582112312317, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8864383697509766, "sampling/sampling_logp_difference/mean": 0.0206074770539999, "step": 3, "step_time": 46.12497856281698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5892857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 1443.169677734375, "completions/mean_terminated_length": 575.3695678710938, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.094785064458847, "epoch": 0.0005089706069474488, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.20012891292572021, "kl": 0.002044647728325799, "learning_rate": 9.996182720447895e-06, "loss": -0.059, "num_tokens": 761507.0, "reward": 0.25282740592956543, "reward_std": 0.2923694849014282, "rewards/unified_reward/mean": 0.25282737612724304, "rewards/unified_reward/std": 0.2923694849014282, "sampling/importance_sampling_ratio/max": 2.3804643154144287, "sampling/importance_sampling_ratio/mean": 0.5519693493843079, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6849451065063477, "sampling/sampling_logp_difference/mean": 0.01579233631491661, "step": 4, "step_time": 51.02434944710694 }, { "clip_ratio/high_max": 0.0008929700416047126, "clip_ratio/high_mean": 0.00012756714568240568, "clip_ratio/low_mean": 2.0237590433680452e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014780473611608613, "completions/clipped_ratio": 0.5803571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1425.419677734375, "completions/mean_terminated_length": 564.4042358398438, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 0.9887424111366272, "epoch": 0.000636213258684311, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.15432791411876678, "kl": 0.004873804748058319, "learning_rate": 9.994910293930526e-06, "loss": -0.0548, "num_tokens": 964186.0, "reward": 0.3898809850215912, "reward_std": 0.35516607761383057, "rewards/unified_reward/mean": 0.3898809552192688, "rewards/unified_reward/std": 0.35516607761383057, "sampling/importance_sampling_ratio/max": 2.5087838172912598, "sampling/importance_sampling_ratio/mean": 0.5148612260818481, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4974842071533203, "sampling/sampling_logp_difference/mean": 0.016274379566311836, "step": 5, "step_time": 52.308474719990045 }, { "clip_ratio/high_max": 0.000274658203125, "clip_ratio/high_mean": 3.923688564100303e-05, "clip_ratio/low_mean": 5.1644504310388584e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 9.088138995139161e-05, "completions/clipped_ratio": 0.6696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 1485.8035888671875, "completions/mean_terminated_length": 346.2162170410156, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 1.6774126887321472, "epoch": 0.0007634559104211731, "frac_reward_zero_std": 0.5, "grad_norm": 0.16085875034332275, "kl": 0.006818469846621156, "learning_rate": 9.993637867413158e-06, "loss": 0.016, "num_tokens": 1172284.0, "reward": 0.2217262089252472, "reward_std": 0.3062482476234436, "rewards/unified_reward/mean": 0.221726194024086, "rewards/unified_reward/std": 0.3062482476234436, "sampling/importance_sampling_ratio/max": 2.8437857627868652, "sampling/importance_sampling_ratio/mean": 0.5471864938735962, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9012317657470703, "sampling/sampling_logp_difference/mean": 0.020371390506625175, "step": 6, "step_time": 43.567426577210426 }, { "clip_ratio/high_max": 0.002169391082134098, "clip_ratio/high_mean": 0.00030991301900940016, "clip_ratio/low_mean": 1.7438616396248108e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003273516267654486, "completions/clipped_ratio": 0.5714285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 1371.8929443359375, "completions/mean_terminated_length": 470.41668701171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.258190393447876, "epoch": 0.0008906985621580354, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.44330862164497375, "kl": 0.008012521080672741, "learning_rate": 9.99236544089579e-06, "loss": 0.086, "num_tokens": 1352424.0, "reward": 0.29181548953056335, "reward_std": 0.25352367758750916, "rewards/unified_reward/mean": 0.29181548953056335, "rewards/unified_reward/std": 0.25352367758750916, "sampling/importance_sampling_ratio/max": 2.881922721862793, "sampling/importance_sampling_ratio/mean": 0.46588850021362305, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5458947420120239, "sampling/sampling_logp_difference/mean": 0.01862291432917118, "step": 7, "step_time": 38.97348318528384 }, { "clip_ratio/high_max": 0.0001624972210265696, "clip_ratio/high_mean": 4.0550919493398396e-05, "clip_ratio/low_mean": 2.2909466679266188e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.346038662741194e-05, "completions/clipped_ratio": 0.660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 1435.6876220703125, "completions/mean_terminated_length": 243.2894744873047, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 1.777740329504013, "epoch": 0.0010179412138948975, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.29478591680526733, "kl": 0.01137150195427239, "learning_rate": 9.99109301437842e-06, "loss": -0.017, "num_tokens": 1537173.0, "reward": 0.22395837306976318, "reward_std": 0.28868189454078674, "rewards/unified_reward/mean": 0.223958358168602, "rewards/unified_reward/std": 0.28868192434310913, "sampling/importance_sampling_ratio/max": 2.6023385524749756, "sampling/importance_sampling_ratio/mean": 0.5634905099868774, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.566436767578125, "sampling/sampling_logp_difference/mean": 0.019680092111229897, "step": 8, "step_time": 39.84304001694545 }, { "clip_ratio/high_max": 0.000457763671875, "clip_ratio/high_mean": 0.00010574993211776018, "clip_ratio/low_mean": 7.115982953109778e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017690976892481558, "completions/clipped_ratio": 0.7321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1395.0, "completions/mean_length": 1640.2410888671875, "completions/mean_terminated_length": 525.7000122070312, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.081162840127945, "epoch": 0.0011451838656317598, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.17604771256446838, "kl": 0.012916217790916562, "learning_rate": 9.989820587861052e-06, "loss": 0.0748, "num_tokens": 1748832.0, "reward": 0.25, "reward_std": 0.32879796624183655, "rewards/unified_reward/mean": 0.25, "rewards/unified_reward/std": 0.32879796624183655, "sampling/importance_sampling_ratio/max": 2.891925573348999, "sampling/importance_sampling_ratio/mean": 0.591586172580719, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6674716472625732, "sampling/sampling_logp_difference/mean": 0.015860356390476227, "step": 9, "step_time": 42.12788641708903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 1549.33935546875, "completions/mean_terminated_length": 302.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2904799282550812, "epoch": 0.001272426517368622, "frac_reward_zero_std": 0.6428571939468384, "grad_norm": 0.14909932017326355, "kl": 0.015407009981572628, "learning_rate": 9.988548161343683e-06, "loss": 0.0478, "num_tokens": 1949678.0, "reward": 0.2485119253396988, "reward_std": 0.35443389415740967, "rewards/unified_reward/mean": 0.2485119104385376, "rewards/unified_reward/std": 0.35443389415740967, "sampling/importance_sampling_ratio/max": 2.6936259269714355, "sampling/importance_sampling_ratio/mean": 0.5536675453186035, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6487839221954346, "sampling/sampling_logp_difference/mean": 0.01746990531682968, "step": 10, "step_time": 42.13068986614235 }, { "clip_ratio/high_max": 0.0007524502725573257, "clip_ratio/high_mean": 0.0001337998282906483, "clip_ratio/low_mean": 0.0003145152695651632, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004483150987653062, "completions/clipped_ratio": 0.8571429252624512, "completions/max_length": 2048.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 1821.696533203125, "completions/mean_terminated_length": 463.875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 1.1754736304283142, "epoch": 0.0013996691691054843, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.16884708404541016, "kl": 0.017221886198967695, "learning_rate": 9.987275734826315e-06, "loss": 0.0405, "num_tokens": 2175196.0, "reward": 0.2075892984867096, "reward_std": 0.28386619687080383, "rewards/unified_reward/mean": 0.2075892835855484, "rewards/unified_reward/std": 0.2838662266731262, "sampling/importance_sampling_ratio/max": 2.969181776046753, "sampling/importance_sampling_ratio/mean": 0.48354414105415344, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8563446998596191, "sampling/sampling_logp_difference/mean": 0.01688259281218052, "step": 11, "step_time": 45.286235192092136 }, { "clip_ratio/high_max": 0.0016344742616638541, "clip_ratio/high_mean": 0.00023349631737801246, "clip_ratio/low_mean": 3.0517578125e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00026401389550301246, "completions/clipped_ratio": 0.7321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 1652.946533203125, "completions/mean_terminated_length": 573.1333618164062, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 1.0353401601314545, "epoch": 0.0015269118208423463, "frac_reward_zero_std": 0.5, "grad_norm": 0.1363184154033661, "kl": 0.01703617535531521, "learning_rate": 9.986003308308946e-06, "loss": -0.0511, "num_tokens": 2381470.0, "reward": 0.2938988208770752, "reward_std": 0.4071679711341858, "rewards/unified_reward/mean": 0.2938988208770752, "rewards/unified_reward/std": 0.4071679711341858, "sampling/importance_sampling_ratio/max": 2.968996286392212, "sampling/importance_sampling_ratio/mean": 0.5105960965156555, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5993747711181641, "sampling/sampling_logp_difference/mean": 0.01684960350394249, "step": 12, "step_time": 41.826852882048115 }, { "clip_ratio/high_max": 0.002104724757373333, "clip_ratio/high_mean": 0.00040530665864935145, "clip_ratio/low_mean": 0.00031849508013692684, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007238017205963843, "completions/clipped_ratio": 0.8303571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 1768.634033203125, "completions/mean_terminated_length": 401.2105407714844, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 1.0868370234966278, "epoch": 0.0016541544725792085, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.16423547267913818, "kl": 0.020288918633013964, "learning_rate": 9.984730881791578e-06, "loss": -0.0008, "num_tokens": 2600677.0, "reward": 0.2822916507720947, "reward_std": 0.33131733536720276, "rewards/unified_reward/mean": 0.2822916507720947, "rewards/unified_reward/std": 0.33131733536720276, "sampling/importance_sampling_ratio/max": 2.7070095539093018, "sampling/importance_sampling_ratio/mean": 0.46946993470191956, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7240264415740967, "sampling/sampling_logp_difference/mean": 0.017231516540050507, "step": 13, "step_time": 46.157953256974 }, { "clip_ratio/high_max": 0.001922607421875, "clip_ratio/high_mean": 0.0002746582067629788, "clip_ratio/low_mean": 0.00027380619212635793, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005484643916133791, "completions/clipped_ratio": 0.8303571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 1770.446533203125, "completions/mean_terminated_length": 411.8947448730469, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 1.2487493455410004, "epoch": 0.0017813971243160708, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.20927460491657257, "kl": 0.020737343933433294, "learning_rate": 9.983458455274209e-06, "loss": -0.0366, "num_tokens": 2825287.0, "reward": 0.3526785969734192, "reward_std": 0.40191328525543213, "rewards/unified_reward/mean": 0.3526785671710968, "rewards/unified_reward/std": 0.40191328525543213, "sampling/importance_sampling_ratio/max": 2.5631420612335205, "sampling/importance_sampling_ratio/mean": 0.5169832110404968, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8339395523071289, "sampling/sampling_logp_difference/mean": 0.01591450534760952, "step": 14, "step_time": 45.45079468516633 }, { "clip_ratio/high_max": 0.0005118109984323382, "clip_ratio/high_mean": 7.311585795832798e-05, "clip_ratio/low_mean": 4.336178835728788e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011647764995359466, "completions/clipped_ratio": 0.7678571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1671.0357666015625, "completions/mean_terminated_length": 424.15386962890625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0463029742240906, "epoch": 0.001908639776052933, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.12709981203079224, "kl": 0.02086797682568431, "learning_rate": 9.98218602875684e-06, "loss": 0.0224, "num_tokens": 3033347.0, "reward": 0.212053582072258, "reward_std": 0.3319958746433258, "rewards/unified_reward/mean": 0.2120535671710968, "rewards/unified_reward/std": 0.3319959044456482, "sampling/importance_sampling_ratio/max": 2.440195083618164, "sampling/importance_sampling_ratio/mean": 0.5255193710327148, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5577147006988525, "sampling/sampling_logp_difference/mean": 0.0149671146646142, "step": 15, "step_time": 44.02011824515648 }, { "clip_ratio/high_max": 0.0006806025558034889, "clip_ratio/high_mean": 9.722893719299464e-05, "clip_ratio/low_mean": 0.00016933628648985177, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002665652173163835, "completions/clipped_ratio": 0.8214285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 1750.446533203125, "completions/mean_terminated_length": 381.70001220703125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 1.0386981666088104, "epoch": 0.002035882427789795, "frac_reward_zero_std": 0.5, "grad_norm": 0.1726647913455963, "kl": 0.023024487774819136, "learning_rate": 9.980913602239472e-06, "loss": -0.0327, "num_tokens": 3251109.0, "reward": 0.18869049847126007, "reward_std": 0.30347681045532227, "rewards/unified_reward/mean": 0.18869049847126007, "rewards/unified_reward/std": 0.30347681045532227, "sampling/importance_sampling_ratio/max": 2.5063118934631348, "sampling/importance_sampling_ratio/mean": 0.4515724778175354, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8402455449104309, "sampling/sampling_logp_difference/mean": 0.01564563438296318, "step": 16, "step_time": 46.15351501270197 }, { "clip_ratio/high_max": 0.000640869140625, "clip_ratio/high_mean": 9.1552734375e-05, "clip_ratio/low_mean": 5.2315849643491674e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014386858401849167, "completions/clipped_ratio": 0.7767857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1718.884033203125, "completions/mean_terminated_length": 573.5599975585938, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.2037937194108963, "epoch": 0.0021631250795266575, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.20392942428588867, "kl": 0.026094529777765274, "learning_rate": 9.979641175722102e-06, "loss": 0.0259, "num_tokens": 3468760.0, "reward": 0.2760416865348816, "reward_std": 0.37318652868270874, "rewards/unified_reward/mean": 0.2760416865348816, "rewards/unified_reward/std": 0.37318652868270874, "sampling/importance_sampling_ratio/max": 2.86702823638916, "sampling/importance_sampling_ratio/mean": 0.5127540230751038, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49755382537841797, "sampling/sampling_logp_difference/mean": 0.01710599847137928, "step": 17, "step_time": 44.51719666784629 }, { "clip_ratio/high_max": 0.0018230413552373648, "clip_ratio/high_mean": 0.0004406820880831219, "clip_ratio/low_mean": 0.0003642453702923376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008049274329096079, "completions/clipped_ratio": 0.848214328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 1812.4554443359375, "completions/mean_terminated_length": 496.1764831542969, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 1.0438321977853775, "epoch": 0.0022903677312635195, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.28247955441474915, "kl": 0.024655427783727646, "learning_rate": 9.978368749204735e-06, "loss": 0.0138, "num_tokens": 3703931.0, "reward": 0.28586310148239136, "reward_std": 0.36451876163482666, "rewards/unified_reward/mean": 0.28586310148239136, "rewards/unified_reward/std": 0.36451876163482666, "sampling/importance_sampling_ratio/max": 2.878389596939087, "sampling/importance_sampling_ratio/mean": 0.5234385132789612, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4653863906860352, "sampling/sampling_logp_difference/mean": 0.015735171735286713, "step": 18, "step_time": 45.58537837699987 }, { "clip_ratio/high_max": 0.0013402343611232936, "clip_ratio/high_mean": 0.00019146205340803135, "clip_ratio/low_mean": 2.1798270608996972e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00021326032401702832, "completions/clipped_ratio": 0.9017857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 1906.5982666015625, "completions/mean_terminated_length": 608.2727661132812, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 1.2198505997657776, "epoch": 0.0024176103830003816, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.19618728756904602, "kl": 0.02384821278974414, "learning_rate": 9.977096322687364e-06, "loss": 0.0568, "num_tokens": 3970438.0, "reward": 0.2537202537059784, "reward_std": 0.2921881675720215, "rewards/unified_reward/mean": 0.253720223903656, "rewards/unified_reward/std": 0.2921881675720215, "sampling/importance_sampling_ratio/max": 2.5823168754577637, "sampling/importance_sampling_ratio/mean": 0.47074127197265625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8100886344909668, "sampling/sampling_logp_difference/mean": 0.018060918897390366, "step": 19, "step_time": 62.073841866105795 }, { "clip_ratio/high_max": 0.0013657623494509608, "clip_ratio/high_mean": 0.0002048020214715507, "clip_ratio/low_mean": 4.7956193157006055e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00025275822190451436, "completions/clipped_ratio": 0.7767857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 1721.9554443359375, "completions/mean_terminated_length": 587.3200073242188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.106226310133934, "epoch": 0.002544853034737244, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.2276524305343628, "kl": 0.029715027660131454, "learning_rate": 9.975823896169998e-06, "loss": 0.0404, "num_tokens": 4201529.0, "reward": 0.32857146859169006, "reward_std": 0.3440210521221161, "rewards/unified_reward/mean": 0.32857146859169006, "rewards/unified_reward/std": 0.3440210819244385, "sampling/importance_sampling_ratio/max": 2.932405471801758, "sampling/importance_sampling_ratio/mean": 0.39400655031204224, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.773353099822998, "sampling/sampling_logp_difference/mean": 0.019054049625992775, "step": 20, "step_time": 49.94204319897108 }, { "clip_ratio/high_max": 0.0028076171875, "clip_ratio/high_mean": 0.0005939612747170031, "clip_ratio/low_mean": 0.0003433457459323108, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009373070206493139, "completions/clipped_ratio": 0.8035714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 1743.3035888671875, "completions/mean_terminated_length": 496.8182067871094, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 1.3627731800079346, "epoch": 0.002672095686474106, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.35102689266204834, "kl": 0.03169079776853323, "learning_rate": 9.974551469652627e-06, "loss": 0.178, "num_tokens": 4418227.0, "reward": 0.3973214328289032, "reward_std": 0.27112963795661926, "rewards/unified_reward/mean": 0.3973214328289032, "rewards/unified_reward/std": 0.27112963795661926, "sampling/importance_sampling_ratio/max": 1.9801931381225586, "sampling/importance_sampling_ratio/mean": 0.390125036239624, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49995994567871094, "sampling/sampling_logp_difference/mean": 0.020502198487520218, "step": 21, "step_time": 44.16935825278051 }, { "clip_ratio/high_max": 0.0025367975467815995, "clip_ratio/high_mean": 0.0004304636240703985, "clip_ratio/low_mean": 0.000287048855170724, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007175124919740483, "completions/clipped_ratio": 0.7321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 1637.9107666015625, "completions/mean_terminated_length": 517.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 1.0630232989788055, "epoch": 0.0027993383382109685, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1436792016029358, "kl": 0.028179598040878773, "learning_rate": 9.97327904313526e-06, "loss": -0.004, "num_tokens": 4621393.0, "reward": 0.349702388048172, "reward_std": 0.32792139053344727, "rewards/unified_reward/mean": 0.3497023582458496, "rewards/unified_reward/std": 0.32792142033576965, "sampling/importance_sampling_ratio/max": 2.16426157951355, "sampling/importance_sampling_ratio/mean": 0.4612317681312561, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7724223136901855, "sampling/sampling_logp_difference/mean": 0.016973258927464485, "step": 22, "step_time": 42.961700269021094 }, { "clip_ratio/high_max": 0.00079345703125, "clip_ratio/high_mean": 0.00011335100225551287, "clip_ratio/low_mean": 4.3596541217993945e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001569475434735068, "completions/clipped_ratio": 0.7142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 1692.3035888671875, "completions/mean_terminated_length": 803.0625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 0.919525220990181, "epoch": 0.0029265809899478305, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.28166598081588745, "kl": 0.024575727991759777, "learning_rate": 9.97200661661789e-06, "loss": 0.0661, "num_tokens": 4864915.0, "reward": 0.24791666865348816, "reward_std": 0.3221869468688965, "rewards/unified_reward/mean": 0.24791666865348816, "rewards/unified_reward/std": 0.3221869468688965, "sampling/importance_sampling_ratio/max": 2.716870069503784, "sampling/importance_sampling_ratio/mean": 0.5202425718307495, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4929513931274414, "sampling/sampling_logp_difference/mean": 0.016098078340291977, "step": 23, "step_time": 54.40860485075973 }, { "clip_ratio/high_max": 0.0014882159302942455, "clip_ratio/high_mean": 0.00021260227731545456, "clip_ratio/low_mean": 8.275549407699145e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000295357771392446, "completions/clipped_ratio": 0.7142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1592.5982666015625, "completions/mean_terminated_length": 454.09375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.1388664841651917, "epoch": 0.0030538236416846926, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.29840007424354553, "kl": 0.032103432808071375, "learning_rate": 9.970734190100523e-06, "loss": 0.0856, "num_tokens": 5060982.0, "reward": 0.37738099694252014, "reward_std": 0.3914428651332855, "rewards/unified_reward/mean": 0.37738099694252014, "rewards/unified_reward/std": 0.3914428651332855, "sampling/importance_sampling_ratio/max": 2.980252742767334, "sampling/importance_sampling_ratio/mean": 0.5059217810630798, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7217617034912109, "sampling/sampling_logp_difference/mean": 0.018017727881669998, "step": 24, "step_time": 40.80201809294522 }, { "clip_ratio/high_max": 5.434782724478282e-05, "clip_ratio/high_mean": 7.763975190755446e-06, "clip_ratio/low_mean": 0.00010152431423193775, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001092882894226932, "completions/clipped_ratio": 0.7053571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 1584.3751220703125, "completions/mean_terminated_length": 474.48486328125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 1.1647514402866364, "epoch": 0.003181066293421555, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.33275464177131653, "kl": 0.03560458356514573, "learning_rate": 9.969461763583153e-06, "loss": 0.0706, "num_tokens": 5272104.0, "reward": 0.3013392984867096, "reward_std": 0.3609514832496643, "rewards/unified_reward/mean": 0.3013392984867096, "rewards/unified_reward/std": 0.3609514832496643, "sampling/importance_sampling_ratio/max": 2.608353853225708, "sampling/importance_sampling_ratio/mean": 0.6668501496315002, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.43196678161621094, "sampling/sampling_logp_difference/mean": 0.017252448946237564, "step": 25, "step_time": 43.755776630947366 }, { "clip_ratio/high_max": 0.0024955213302746415, "clip_ratio/high_mean": 0.0003565030492609367, "clip_ratio/low_mean": 2.8576001568580978e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038507905264850706, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1384.02685546875, "completions/mean_terminated_length": 530.346923828125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 0.8021742403507233, "epoch": 0.003308308945158417, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2553984820842743, "kl": 0.028776167891919613, "learning_rate": 9.968189337065786e-06, "loss": 0.0053, "num_tokens": 5458019.0, "reward": 0.24523811042308807, "reward_std": 0.33490192890167236, "rewards/unified_reward/mean": 0.24523809552192688, "rewards/unified_reward/std": 0.33490192890167236, "sampling/importance_sampling_ratio/max": 2.755847930908203, "sampling/importance_sampling_ratio/mean": 0.5766149163246155, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8653354644775391, "sampling/sampling_logp_difference/mean": 0.014556949958205223, "step": 26, "step_time": 41.73203463992104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00010986728921125177, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00010986728921125177, "completions/clipped_ratio": 0.7142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 1698.571533203125, "completions/mean_terminated_length": 825.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.980657160282135, "epoch": 0.003435551596895279, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.30468952655792236, "kl": 0.030222749337553978, "learning_rate": 9.966916910548416e-06, "loss": 0.0346, "num_tokens": 5673147.0, "reward": 0.336309552192688, "reward_std": 0.3783308267593384, "rewards/unified_reward/mean": 0.3363095223903656, "rewards/unified_reward/std": 0.3783308267593384, "sampling/importance_sampling_ratio/max": 2.8292524814605713, "sampling/importance_sampling_ratio/mean": 0.5543617010116577, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.6818015575408936, "sampling/sampling_logp_difference/mean": 0.01723049022257328, "step": 27, "step_time": 44.66397899528965 }, { "clip_ratio/high_max": 0.0035663539892993867, "clip_ratio/high_mean": 0.0006499892333522439, "clip_ratio/low_mean": 5.0012225983664393e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007000014447839931, "completions/clipped_ratio": 0.660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1542.5535888671875, "completions/mean_terminated_length": 558.26318359375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 1.133430853486061, "epoch": 0.0035627942486321416, "frac_reward_zero_std": 0.5, "grad_norm": 0.19386565685272217, "kl": 0.030714900232851505, "learning_rate": 9.965644484031049e-06, "loss": 0.0108, "num_tokens": 5871177.0, "reward": 0.3683035969734192, "reward_std": 0.4195304811000824, "rewards/unified_reward/mean": 0.3683035671710968, "rewards/unified_reward/std": 0.41953045129776, "sampling/importance_sampling_ratio/max": 2.930713176727295, "sampling/importance_sampling_ratio/mean": 0.5653702020645142, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4976801872253418, "sampling/sampling_logp_difference/mean": 0.018163630738854408, "step": 28, "step_time": 40.82763177691959 }, { "clip_ratio/high_max": 0.00029214139794930816, "clip_ratio/high_mean": 4.173448542132974e-05, "clip_ratio/low_mean": 1.0407777153886855e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.214226257521659e-05, "completions/clipped_ratio": 0.6339285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 1502.1429443359375, "completions/mean_terminated_length": 556.8780517578125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0713493674993515, "epoch": 0.0036900369003690036, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2023574262857437, "kl": 0.03125083167105913, "learning_rate": 9.964372057513678e-06, "loss": 0.1081, "num_tokens": 6065857.0, "reward": 0.3370535969734192, "reward_std": 0.3602823317050934, "rewards/unified_reward/mean": 0.3370535671710968, "rewards/unified_reward/std": 0.360282301902771, "sampling/importance_sampling_ratio/max": 2.6795403957366943, "sampling/importance_sampling_ratio/mean": 0.6117401123046875, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49358129501342773, "sampling/sampling_logp_difference/mean": 0.017515165731310844, "step": 29, "step_time": 40.075427016709 }, { "clip_ratio/high_max": 0.001287622726522386, "clip_ratio/high_mean": 0.0001839461037889123, "clip_ratio/low_mean": 0.00012445677202777006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003084028867306188, "completions/clipped_ratio": 0.7142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 1630.6160888671875, "completions/mean_terminated_length": 587.15625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 1.357563078403473, "epoch": 0.003817279552105866, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.12820035219192505, "kl": 0.055331550538539886, "learning_rate": 9.963099630996312e-06, "loss": 0.0031, "num_tokens": 6272902.0, "reward": 0.45922622084617615, "reward_std": 0.31642428040504456, "rewards/unified_reward/mean": 0.45922619104385376, "rewards/unified_reward/std": 0.31642428040504456, "sampling/importance_sampling_ratio/max": 2.804027557373047, "sampling/importance_sampling_ratio/mean": 0.4617818295955658, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49263620376586914, "sampling/sampling_logp_difference/mean": 0.020562663674354553, "step": 30, "step_time": 43.86242316872813 }, { "clip_ratio/high_max": 0.0008998693956527859, "clip_ratio/high_mean": 0.00012855277236667462, "clip_ratio/low_mean": 6.975446376600303e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019830723249469884, "completions/clipped_ratio": 0.6875000596046448, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 1560.2679443359375, "completions/mean_terminated_length": 487.25714111328125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 1.2341766953468323, "epoch": 0.003944522203842728, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.4258900284767151, "kl": 0.03601947519928217, "learning_rate": 9.961827204478941e-06, "loss": 0.0083, "num_tokens": 6475460.0, "reward": 0.4196428656578064, "reward_std": 0.381623774766922, "rewards/unified_reward/mean": 0.4196428656578064, "rewards/unified_reward/std": 0.3816237449645996, "sampling/importance_sampling_ratio/max": 2.8391993045806885, "sampling/importance_sampling_ratio/mean": 0.5123783946037292, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5395584106445312, "sampling/sampling_logp_difference/mean": 0.018113691359758377, "step": 31, "step_time": 42.08984502009116 }, { "clip_ratio/high_max": 0.0009765625, "clip_ratio/high_mean": 0.00013950893116998486, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013950893116998486, "completions/clipped_ratio": 0.5714285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 1382.0982666015625, "completions/mean_terminated_length": 494.22918701171875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 1.086118370294571, "epoch": 0.00407176485557959, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.18904294073581696, "kl": 0.03799611935392022, "learning_rate": 9.960554777961573e-06, "loss": 0.0858, "num_tokens": 6654079.0, "reward": 0.1979166865348816, "reward_std": 0.19443103671073914, "rewards/unified_reward/mean": 0.1979166716337204, "rewards/unified_reward/std": 0.19443103671073914, "sampling/importance_sampling_ratio/max": 2.6588196754455566, "sampling/importance_sampling_ratio/mean": 0.5212978720664978, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4968085289001465, "sampling/sampling_logp_difference/mean": 0.01695387437939644, "step": 32, "step_time": 38.187348230043426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6964285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 1599.27685546875, "completions/mean_terminated_length": 569.8529663085938, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 1.2409682273864746, "epoch": 0.004199007507316452, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.2529830038547516, "kl": 0.03484726697206497, "learning_rate": 9.959282351444206e-06, "loss": 0.0228, "num_tokens": 6884222.0, "reward": 0.17023809254169464, "reward_std": 0.21638846397399902, "rewards/unified_reward/mean": 0.17023809254169464, "rewards/unified_reward/std": 0.21638844907283783, "sampling/importance_sampling_ratio/max": 2.3074567317962646, "sampling/importance_sampling_ratio/mean": 0.592124342918396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4957118034362793, "sampling/sampling_logp_difference/mean": 0.017031386494636536, "step": 33, "step_time": 55.5271612838842 }, { "clip_ratio/high_max": 0.0020802695071324706, "clip_ratio/high_mean": 0.00041230287934013177, "clip_ratio/low_mean": 8.198060277209152e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004942834857502021, "completions/clipped_ratio": 0.6428571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 1531.107177734375, "completions/mean_terminated_length": 600.7000122070312, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 1.7582168877124786, "epoch": 0.004326250159053315, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.1997503787279129, "kl": 0.03978583961725235, "learning_rate": 9.958009924926836e-06, "loss": -0.0527, "num_tokens": 7096482.0, "reward": 0.5171131491661072, "reward_std": 0.3535778820514679, "rewards/unified_reward/mean": 0.5171130895614624, "rewards/unified_reward/std": 0.3535778820514679, "sampling/importance_sampling_ratio/max": 2.5338780879974365, "sampling/importance_sampling_ratio/mean": 0.4607158303260803, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0532703399658203, "sampling/sampling_logp_difference/mean": 0.019909752532839775, "step": 34, "step_time": 55.20488307438791 }, { "clip_ratio/high_max": 0.002464809178491123, "clip_ratio/high_mean": 0.00035211559224990197, "clip_ratio/low_mean": 3.0517578125e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038263317037490197, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1521.0535888671875, "completions/mean_terminated_length": 642.8095092773438, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7220804691314697, "epoch": 0.004453492810790177, "frac_reward_zero_std": 0.0, "grad_norm": 0.21036012470722198, "kl": 0.04183186311274767, "learning_rate": 9.956737498409469e-06, "loss": 0.0886, "num_tokens": 7297056.0, "reward": 0.37931549549102783, "reward_std": 0.34224021434783936, "rewards/unified_reward/mean": 0.37931546568870544, "rewards/unified_reward/std": 0.34224018454551697, "sampling/importance_sampling_ratio/max": 2.259063720703125, "sampling/importance_sampling_ratio/mean": 0.5323424935340881, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6346173286437988, "sampling/sampling_logp_difference/mean": 0.02036343328654766, "step": 35, "step_time": 41.41028846078552 }, { "clip_ratio/high_max": 0.0008795439280220307, "clip_ratio/high_mean": 0.0001256491323147202, "clip_ratio/low_mean": 0.00015997666014300194, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00028562579973367974, "completions/clipped_ratio": 0.6160714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 1414.6876220703125, "completions/mean_terminated_length": 398.4418640136719, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.5843049883842468, "epoch": 0.004580735462527039, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.250408798456192, "kl": 0.0430659269914031, "learning_rate": 9.955465071892098e-06, "loss": 0.0185, "num_tokens": 7489117.0, "reward": 0.2514881193637848, "reward_std": 0.2526095509529114, "rewards/unified_reward/mean": 0.2514881193637848, "rewards/unified_reward/std": 0.2526095509529114, "sampling/importance_sampling_ratio/max": 2.8878588676452637, "sampling/importance_sampling_ratio/mean": 0.5391421318054199, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7988696098327637, "sampling/sampling_logp_difference/mean": 0.02007574401795864, "step": 36, "step_time": 43.52736741001718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.598214328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1407.9376220703125, "completions/mean_terminated_length": 454.95556640625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.5937454402446747, "epoch": 0.004707978114263901, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.23580120503902435, "kl": 0.043106429278850555, "learning_rate": 9.954192645374731e-06, "loss": 0.0554, "num_tokens": 7666446.0, "reward": 0.376339316368103, "reward_std": 0.4026913642883301, "rewards/unified_reward/mean": 0.37633928656578064, "rewards/unified_reward/std": 0.4026913642883301, "sampling/importance_sampling_ratio/max": 2.6905839443206787, "sampling/importance_sampling_ratio/mean": 0.5723137855529785, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5642995834350586, "sampling/sampling_logp_difference/mean": 0.020185722038149834, "step": 37, "step_time": 39.21579040703364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5535714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 1353.5357666015625, "completions/mean_terminated_length": 492.3999938964844, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 1.3324599117040634, "epoch": 0.004835220766000763, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.29825106263160706, "kl": 0.04331509117037058, "learning_rate": 9.952920218857361e-06, "loss": 0.0309, "num_tokens": 7841882.0, "reward": 0.4680059850215912, "reward_std": 0.33993566036224365, "rewards/unified_reward/mean": 0.4680059552192688, "rewards/unified_reward/std": 0.33993563055992126, "sampling/importance_sampling_ratio/max": 2.9710774421691895, "sampling/importance_sampling_ratio/mean": 0.6345796585083008, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5294208526611328, "sampling/sampling_logp_difference/mean": 0.01737799122929573, "step": 38, "step_time": 37.449205379001796 }, { "clip_ratio/high_max": 0.001918228401336819, "clip_ratio/high_mean": 0.00028711158665828407, "clip_ratio/low_mean": 0.0001532847381895408, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004403963393997401, "completions/clipped_ratio": 0.6964285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 1571.759033203125, "completions/mean_terminated_length": 479.20587158203125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 1.8867179155349731, "epoch": 0.004962463417737626, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.12174507230520248, "kl": 0.04688160587102175, "learning_rate": 9.951647792339994e-06, "loss": -0.0516, "num_tokens": 8040311.0, "reward": 0.265625, "reward_std": 0.3459567725658417, "rewards/unified_reward/mean": 0.265625, "rewards/unified_reward/std": 0.3459567725658417, "sampling/importance_sampling_ratio/max": 2.6081597805023193, "sampling/importance_sampling_ratio/mean": 0.47043243050575256, "sampling/importance_sampling_ratio/min": 0.0004410727706272155, "sampling/sampling_logp_difference/max": 0.9003715515136719, "sampling/sampling_logp_difference/mean": 0.019568754360079765, "step": 39, "step_time": 41.80562071315944 }, { "clip_ratio/high_max": 0.0009727650322020054, "clip_ratio/high_mean": 0.00013896643213229254, "clip_ratio/low_mean": 6.572548591066152e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020469191440497525, "completions/clipped_ratio": 0.6428571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1522.6429443359375, "completions/mean_terminated_length": 577.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 1.8913870453834534, "epoch": 0.005089706069474488, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.1627778708934784, "kl": 0.05773734301328659, "learning_rate": 9.950375365822624e-06, "loss": -0.0125, "num_tokens": 8234415.0, "reward": 0.34017857909202576, "reward_std": 0.31931135058403015, "rewards/unified_reward/mean": 0.34017854928970337, "rewards/unified_reward/std": 0.31931132078170776, "sampling/importance_sampling_ratio/max": 2.8848350048065186, "sampling/importance_sampling_ratio/mean": 0.4254019260406494, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1280252933502197, "sampling/sampling_logp_difference/mean": 0.02180030196905136, "step": 40, "step_time": 41.72919622506015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6785714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1609.294677734375, "completions/mean_terminated_length": 683.138916015625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 1.5661423802375793, "epoch": 0.00521694872121135, "frac_reward_zero_std": 0.5, "grad_norm": 0.23106275498867035, "kl": 0.04794228170067072, "learning_rate": 9.949102939305257e-06, "loss": 0.1052, "num_tokens": 8449856.0, "reward": 0.3358631432056427, "reward_std": 0.3540005087852478, "rewards/unified_reward/mean": 0.3358631134033203, "rewards/unified_reward/std": 0.3540005087852478, "sampling/importance_sampling_ratio/max": 2.5266377925872803, "sampling/importance_sampling_ratio/mean": 0.5052246451377869, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9942240715026855, "sampling/sampling_logp_difference/mean": 0.019592521712183952, "step": 41, "step_time": 46.633664048975334 }, { "clip_ratio/high_max": 0.0012918896100018173, "clip_ratio/high_mean": 0.00019327497011545347, "clip_ratio/low_mean": 2.9693124815821648e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022296809402178042, "completions/clipped_ratio": 0.6339285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 1558.1876220703125, "completions/mean_terminated_length": 709.9755859375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 1.7647501826286316, "epoch": 0.005344191372948212, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.20505571365356445, "kl": 0.04273036029189825, "learning_rate": 9.947830512787887e-06, "loss": -0.0244, "num_tokens": 8644437.0, "reward": 0.3601190745830536, "reward_std": 0.38655686378479004, "rewards/unified_reward/mean": 0.3601190745830536, "rewards/unified_reward/std": 0.38655683398246765, "sampling/importance_sampling_ratio/max": 2.870969772338867, "sampling/importance_sampling_ratio/mean": 0.5087010264396667, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2371540069580078, "sampling/sampling_logp_difference/mean": 0.017949840053915977, "step": 42, "step_time": 40.07162801688537 }, { "clip_ratio/high_max": 0.00016228496679104865, "clip_ratio/high_mean": 2.3183567464002408e-05, "clip_ratio/low_mean": 4.542250144368154e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.860606890768395e-05, "completions/clipped_ratio": 0.5803571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1431.884033203125, "completions/mean_terminated_length": 579.8084716796875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 1.2550879269838333, "epoch": 0.005471434024685074, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.24562814831733704, "kl": 0.04389694146811962, "learning_rate": 9.94655808627052e-06, "loss": 0.0921, "num_tokens": 8841008.0, "reward": 0.29226192831993103, "reward_std": 0.3220222592353821, "rewards/unified_reward/mean": 0.29226189851760864, "rewards/unified_reward/std": 0.3220222592353821, "sampling/importance_sampling_ratio/max": 2.786583185195923, "sampling/importance_sampling_ratio/mean": 0.6845951080322266, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48686647415161133, "sampling/sampling_logp_difference/mean": 0.015619372949004173, "step": 43, "step_time": 46.63333671586588 }, { "clip_ratio/high_max": 0.0012612713035196066, "clip_ratio/high_mean": 0.00018018161426880397, "clip_ratio/low_mean": 6.846985615993617e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024865146951924544, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1361.0, "completions/mean_length": 1371.9554443359375, "completions/mean_terminated_length": 502.7550964355469, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 1.234447255730629, "epoch": 0.005598676676421937, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.23499596118927002, "kl": 0.060349732637405396, "learning_rate": 9.94528565975315e-06, "loss": 0.0569, "num_tokens": 9022219.0, "reward": 0.32678571343421936, "reward_std": 0.33476686477661133, "rewards/unified_reward/mean": 0.32678571343421936, "rewards/unified_reward/std": 0.33476686477661133, "sampling/importance_sampling_ratio/max": 2.9267404079437256, "sampling/importance_sampling_ratio/mean": 0.5743741989135742, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4973430633544922, "sampling/sampling_logp_difference/mean": 0.015717267990112305, "step": 44, "step_time": 40.762353440979496 }, { "clip_ratio/high_max": 0.0016080548230092973, "clip_ratio/high_mean": 0.0002616955116536701, "clip_ratio/low_mean": 0.00011951287979172776, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00038120839508337667, "completions/clipped_ratio": 0.5446428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 1303.8482666015625, "completions/mean_terminated_length": 413.7843322753906, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.8592419028282166, "epoch": 0.005725919328158799, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.26557615399360657, "kl": 0.05456068553030491, "learning_rate": 9.944013233235783e-06, "loss": 0.0726, "num_tokens": 9205610.0, "reward": 0.3563988506793976, "reward_std": 0.3586849272251129, "rewards/unified_reward/mean": 0.3563988208770752, "rewards/unified_reward/std": 0.3586849272251129, "sampling/importance_sampling_ratio/max": 2.9609508514404297, "sampling/importance_sampling_ratio/mean": 0.6502270698547363, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4910717010498047, "sampling/sampling_logp_difference/mean": 0.018736490979790688, "step": 45, "step_time": 42.31584817264229 }, { "clip_ratio/high_max": 0.001068115234375, "clip_ratio/high_mean": 0.0001525878915344947, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001525878915344947, "completions/clipped_ratio": 0.6428571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 1527.5535888671875, "completions/mean_terminated_length": 590.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.2340985238552094, "epoch": 0.005853161979895661, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.12732401490211487, "kl": 0.04292089492082596, "learning_rate": 9.942740806718412e-06, "loss": 0.0284, "num_tokens": 9401704.0, "reward": 0.2180059850215912, "reward_std": 0.2887283265590668, "rewards/unified_reward/mean": 0.2180059850215912, "rewards/unified_reward/std": 0.2887283265590668, "sampling/importance_sampling_ratio/max": 2.274038314819336, "sampling/importance_sampling_ratio/mean": 0.5468476414680481, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6743918061256409, "sampling/sampling_logp_difference/mean": 0.01484087947756052, "step": 46, "step_time": 40.43547564186156 }, { "clip_ratio/high_max": 0.0011861210805363953, "clip_ratio/high_mean": 0.000169445869687479, "clip_ratio/low_mean": 2.1798270608996972e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019124414302496007, "completions/clipped_ratio": 0.5178571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 1329.884033203125, "completions/mean_terminated_length": 558.5740966796875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2224094867706299, "epoch": 0.005980404631632523, "frac_reward_zero_std": 0.5, "grad_norm": 0.32322633266448975, "kl": 0.0453768614679575, "learning_rate": 9.941468380201044e-06, "loss": 0.0943, "num_tokens": 9583603.0, "reward": 0.3630952537059784, "reward_std": 0.41385799646377563, "rewards/unified_reward/mean": 0.3630952537059784, "rewards/unified_reward/std": 0.41385796666145325, "sampling/importance_sampling_ratio/max": 2.844754219055176, "sampling/importance_sampling_ratio/mean": 0.6642330288887024, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.211749076843262, "sampling/sampling_logp_difference/mean": 0.013903745450079441, "step": 47, "step_time": 38.81852338206954 }, { "clip_ratio/high_max": 0.00017973356443690136, "clip_ratio/high_mean": 2.567622414062498e-05, "clip_ratio/low_mean": 6.278882665355923e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.1955106805980904e-05, "completions/clipped_ratio": 0.6071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 1441.5179443359375, "completions/mean_terminated_length": 504.227294921875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 1.1984282582998276, "epoch": 0.006107647283369385, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.17811252176761627, "kl": 0.04812593664973974, "learning_rate": 9.940195953683675e-06, "loss": 0.0029, "num_tokens": 9772109.0, "reward": 0.4141369163990021, "reward_std": 0.3662720322608948, "rewards/unified_reward/mean": 0.4141368865966797, "rewards/unified_reward/std": 0.3662720322608948, "sampling/importance_sampling_ratio/max": 2.770270824432373, "sampling/importance_sampling_ratio/mean": 0.5540902018547058, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.092456340789795, "sampling/sampling_logp_difference/mean": 0.014849220402538776, "step": 48, "step_time": 40.46888073114678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4642857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 1342.7232666015625, "completions/mean_terminated_length": 731.4833984375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 1.1426559835672379, "epoch": 0.006234889935106248, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.18711313605308533, "kl": 0.04707792494446039, "learning_rate": 9.938923527166307e-06, "loss": -0.0198, "num_tokens": 9950582.0, "reward": 0.2785714566707611, "reward_std": 0.3205462396144867, "rewards/unified_reward/mean": 0.2785714268684387, "rewards/unified_reward/std": 0.3205462396144867, "sampling/importance_sampling_ratio/max": 2.903702735900879, "sampling/importance_sampling_ratio/mean": 0.5990584492683411, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5092005729675293, "sampling/sampling_logp_difference/mean": 0.01464072335511446, "step": 49, "step_time": 37.10881744744256 }, { "clip_ratio/high_max": 0.0018705488182604313, "clip_ratio/high_mean": 0.00036824578637606464, "clip_ratio/low_mean": 0.00011901814741577255, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00048726391105446965, "completions/clipped_ratio": 0.6696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 1560.5804443359375, "completions/mean_terminated_length": 572.5675659179688, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 1.5067033469676971, "epoch": 0.00636213258684311, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.21489779651165009, "kl": 0.048728376626968384, "learning_rate": 9.937651100648938e-06, "loss": -0.0515, "num_tokens": 10148431.0, "reward": 0.46696433424949646, "reward_std": 0.3414989411830902, "rewards/unified_reward/mean": 0.4669643044471741, "rewards/unified_reward/std": 0.3414989411830902, "sampling/importance_sampling_ratio/max": 2.442664384841919, "sampling/importance_sampling_ratio/mean": 0.5673736333847046, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.45806384086608887, "sampling/sampling_logp_difference/mean": 0.01769907958805561, "step": 50, "step_time": 40.89671736792661 }, { "clip_ratio/high_max": 3.0517578125e-05, "clip_ratio/high_mean": 4.359654212748865e-06, "clip_ratio/low_mean": 6.0769589254050516e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.0436613138153916e-05, "completions/clipped_ratio": 0.535714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1329.77685546875, "completions/mean_terminated_length": 501.0577087402344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 1.5529195666313171, "epoch": 0.006489375238579972, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.13774451613426208, "kl": 0.05210867151618004, "learning_rate": 9.93637867413157e-06, "loss": 0.0259, "num_tokens": 10330838.0, "reward": 0.3523809611797333, "reward_std": 0.337539941072464, "rewards/unified_reward/mean": 0.3523809611797333, "rewards/unified_reward/std": 0.337539941072464, "sampling/importance_sampling_ratio/max": 2.4843335151672363, "sampling/importance_sampling_ratio/mean": 0.5389645099639893, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5123434066772461, "sampling/sampling_logp_difference/mean": 0.01701294630765915, "step": 51, "step_time": 46.54835203429684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4464285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 1182.7054443359375, "completions/mean_terminated_length": 484.8870849609375, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 1.3193459510803223, "epoch": 0.006616617890316834, "frac_reward_zero_std": 0.5, "grad_norm": 0.24470581114292145, "kl": 0.04838210716843605, "learning_rate": 9.9351062476142e-06, "loss": -0.0232, "num_tokens": 10482445.0, "reward": 0.3058035969734192, "reward_std": 0.3439358174800873, "rewards/unified_reward/mean": 0.3058035671710968, "rewards/unified_reward/std": 0.3439358174800873, "sampling/importance_sampling_ratio/max": 2.9412496089935303, "sampling/importance_sampling_ratio/mean": 0.6154170036315918, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7082525491714478, "sampling/sampling_logp_difference/mean": 0.017177818343043327, "step": 52, "step_time": 34.79104690812528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1129.90185546875, "completions/mean_terminated_length": 466.046142578125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.9458073973655701, "epoch": 0.006743860542053696, "frac_reward_zero_std": 0.5, "grad_norm": 0.2820485830307007, "kl": 0.04642812814563513, "learning_rate": 9.933833821096832e-06, "loss": 0.0376, "num_tokens": 10649666.0, "reward": 0.2886905074119568, "reward_std": 0.2918925881385803, "rewards/unified_reward/mean": 0.2886905074119568, "rewards/unified_reward/std": 0.2918925881385803, "sampling/importance_sampling_ratio/max": 2.8820669651031494, "sampling/importance_sampling_ratio/mean": 0.6849983334541321, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5908365249633789, "sampling/sampling_logp_difference/mean": 0.012867706827819347, "step": 53, "step_time": 45.84761166293174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1123.2232666015625, "completions/mean_terminated_length": 454.5384521484375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.9410253018140793, "epoch": 0.006871103193790558, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2990758717060089, "kl": 0.049800154753029346, "learning_rate": 9.932561394579464e-06, "loss": -0.0298, "num_tokens": 10798771.0, "reward": 0.21607144176959991, "reward_std": 0.24342122673988342, "rewards/unified_reward/mean": 0.21607144176959991, "rewards/unified_reward/std": 0.24342124164104462, "sampling/importance_sampling_ratio/max": 2.78424072265625, "sampling/importance_sampling_ratio/mean": 0.7783504128456116, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5669441223144531, "sampling/sampling_logp_difference/mean": 0.012777761556208134, "step": 54, "step_time": 35.20482253306545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3750000298023224, "completions/max_length": 2048.0, "completions/max_terminated_length": 1594.0, "completions/mean_length": 1030.52685546875, "completions/mean_terminated_length": 420.0428466796875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 1.0882972031831741, "epoch": 0.006998345845527421, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.40779098868370056, "kl": 0.051785184536129236, "learning_rate": 9.931288968062095e-06, "loss": 0.0592, "num_tokens": 10940774.0, "reward": 0.469494104385376, "reward_std": 0.4146822988986969, "rewards/unified_reward/mean": 0.4694940745830536, "rewards/unified_reward/std": 0.4146822988986969, "sampling/importance_sampling_ratio/max": 2.4382474422454834, "sampling/importance_sampling_ratio/mean": 0.694859504699707, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4691455364227295, "sampling/sampling_logp_difference/mean": 0.015274052508175373, "step": 55, "step_time": 34.63028818904422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 1126.83935546875, "completions/mean_terminated_length": 460.76922607421875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.1780275106430054, "epoch": 0.007125588497264283, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.19490008056163788, "kl": 0.050345863215625286, "learning_rate": 9.930016541544726e-06, "loss": 0.0826, "num_tokens": 11105204.0, "reward": 0.4052083194255829, "reward_std": 0.4032030701637268, "rewards/unified_reward/mean": 0.4052083194255829, "rewards/unified_reward/std": 0.4032030701637268, "sampling/importance_sampling_ratio/max": 2.682018280029297, "sampling/importance_sampling_ratio/mean": 0.6349343061447144, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0376503467559814, "sampling/sampling_logp_difference/mean": 0.01616464927792549, "step": 56, "step_time": 40.26595371286385 }, { "clip_ratio/high_max": 0.0005618311406578869, "clip_ratio/high_mean": 8.026159048313275e-05, "clip_ratio/low_mean": 2.7349804440746084e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00010761139492387883, "completions/clipped_ratio": 0.5714285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1950.0, "completions/mean_length": 1442.571533203125, "completions/mean_terminated_length": 635.3333740234375, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 1.645439773797989, "epoch": 0.007252831149001145, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.1603781133890152, "kl": 0.05560203455388546, "learning_rate": 9.928744115027358e-06, "loss": -0.0559, "num_tokens": 11318004.0, "reward": 0.3674107491970062, "reward_std": 0.38881632685661316, "rewards/unified_reward/mean": 0.36741071939468384, "rewards/unified_reward/std": 0.38881629705429077, "sampling/importance_sampling_ratio/max": 2.532306432723999, "sampling/importance_sampling_ratio/mean": 0.42244622111320496, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9363300800323486, "sampling/sampling_logp_difference/mean": 0.01943604275584221, "step": 57, "step_time": 56.23721193894744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 1151.3482666015625, "completions/mean_terminated_length": 503.0, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 1.2668726593255997, "epoch": 0.007380073800738007, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.22457250952720642, "kl": 0.0462976461276412, "learning_rate": 9.92747168850999e-06, "loss": 0.021, "num_tokens": 11480019.0, "reward": 0.40119048953056335, "reward_std": 0.34487318992614746, "rewards/unified_reward/mean": 0.40119048953056335, "rewards/unified_reward/std": 0.3448731601238251, "sampling/importance_sampling_ratio/max": 2.9765806198120117, "sampling/importance_sampling_ratio/mean": 0.5529577136039734, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47420716285705566, "sampling/sampling_logp_difference/mean": 0.016393668949604034, "step": 58, "step_time": 38.08783443667926 }, { "clip_ratio/high_max": 4.057124169776216e-05, "clip_ratio/high_mean": 5.795891866000602e-06, "clip_ratio/low_mean": 0.00015914415234874468, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016494004194100853, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1494.7501220703125, "completions/mean_terminated_length": 572.6666870117188, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.3950148820877075, "epoch": 0.007507316452474869, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.1518317461013794, "kl": 0.047594593837857246, "learning_rate": 9.92619926199262e-06, "loss": 0.0005, "num_tokens": 11671791.0, "reward": 0.3072916865348816, "reward_std": 0.38262295722961426, "rewards/unified_reward/mean": 0.3072916865348816, "rewards/unified_reward/std": 0.38262295722961426, "sampling/importance_sampling_ratio/max": 2.3219330310821533, "sampling/importance_sampling_ratio/mean": 0.48600324988365173, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5600152015686035, "sampling/sampling_logp_difference/mean": 0.01726531609892845, "step": 59, "step_time": 39.44704339141026 }, { "clip_ratio/high_max": 0.001373291015625, "clip_ratio/high_mean": 0.00019618443184299394, "clip_ratio/low_mean": 0.00017842969009507215, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003746141155716032, "completions/clipped_ratio": 0.5892857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1418.9285888671875, "completions/mean_terminated_length": 516.3478393554688, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.539114534854889, "epoch": 0.007634559104211732, "frac_reward_zero_std": 0.5, "grad_norm": 0.1015680655837059, "kl": 0.054857795126736164, "learning_rate": 9.924926835475252e-06, "loss": -0.0057, "num_tokens": 11855079.0, "reward": 0.25089287757873535, "reward_std": 0.3553948700428009, "rewards/unified_reward/mean": 0.25089284777641296, "rewards/unified_reward/std": 0.3553948402404785, "sampling/importance_sampling_ratio/max": 2.809103012084961, "sampling/importance_sampling_ratio/mean": 0.5172562003135681, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5869202613830566, "sampling/sampling_logp_difference/mean": 0.018753081560134888, "step": 60, "step_time": 39.59412003075704 }, { "clip_ratio/high_max": 0.0013052574649918824, "clip_ratio/high_mean": 0.0001864653495431412, "clip_ratio/low_mean": 5.725278697354952e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001921906259667594, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1642.0, "completions/mean_length": 1361.544677734375, "completions/mean_terminated_length": 478.95916748046875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.6675967276096344, "epoch": 0.007761801755948594, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2650478482246399, "kl": 0.0546365836635232, "learning_rate": 9.923654408957883e-06, "loss": 0.0123, "num_tokens": 12047668.0, "reward": 0.40476194024086, "reward_std": 0.40437689423561096, "rewards/unified_reward/mean": 0.40476194024086, "rewards/unified_reward/std": 0.40437689423561096, "sampling/importance_sampling_ratio/max": 2.893214464187622, "sampling/importance_sampling_ratio/mean": 0.4738881587982178, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5530310869216919, "sampling/sampling_logp_difference/mean": 0.020893089473247528, "step": 61, "step_time": 48.99418081319891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 1.662677550484659e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.662677550484659e-05, "completions/clipped_ratio": 0.5178571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 1293.3929443359375, "completions/mean_terminated_length": 482.8888854980469, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "entropy": 1.5073977708816528, "epoch": 0.007889044407685456, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3236585557460785, "kl": 0.05789581499993801, "learning_rate": 9.922381982440515e-06, "loss": -0.0257, "num_tokens": 12240736.0, "reward": 0.42901790142059326, "reward_std": 0.39083436131477356, "rewards/unified_reward/mean": 0.4290178716182709, "rewards/unified_reward/std": 0.39083433151245117, "sampling/importance_sampling_ratio/max": 2.9283440113067627, "sampling/importance_sampling_ratio/mean": 0.5695586800575256, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49019789695739746, "sampling/sampling_logp_difference/mean": 0.019733723253011703, "step": 62, "step_time": 47.897733143297955 }, { "clip_ratio/high_max": 0.000823974609375, "clip_ratio/high_mean": 0.00014386858674697578, "clip_ratio/low_mean": 0.00021729404852521839, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00036116263072472066, "completions/clipped_ratio": 0.6428571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 1504.40185546875, "completions/mean_terminated_length": 525.9249877929688, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 1.7339535653591156, "epoch": 0.008016287059422318, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.12850695848464966, "kl": 0.05133673455566168, "learning_rate": 9.921109555923146e-06, "loss": 0.0195, "num_tokens": 12430349.0, "reward": 0.3869048058986664, "reward_std": 0.2897949814796448, "rewards/unified_reward/mean": 0.386904776096344, "rewards/unified_reward/std": 0.2897949814796448, "sampling/importance_sampling_ratio/max": 2.39733624458313, "sampling/importance_sampling_ratio/mean": 0.4029373526573181, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6958844661712646, "sampling/sampling_logp_difference/mean": 0.01982896961271763, "step": 63, "step_time": 40.02818918693811 }, { "clip_ratio/high_max": 0.0025127461994998157, "clip_ratio/high_mean": 0.0005304057049215771, "clip_ratio/low_mean": 2.0641111404984258e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005510468254215084, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 1418.4285888671875, "completions/mean_terminated_length": 608.9795532226562, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.2718296647071838, "epoch": 0.00814352971115918, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.30125927925109863, "kl": 0.044806099496781826, "learning_rate": 9.919837129405778e-06, "loss": 0.0163, "num_tokens": 12612173.0, "reward": 0.43139880895614624, "reward_std": 0.3987137973308563, "rewards/unified_reward/mean": 0.43139880895614624, "rewards/unified_reward/std": 0.3987137973308563, "sampling/importance_sampling_ratio/max": 2.9348857402801514, "sampling/importance_sampling_ratio/mean": 0.5589476227760315, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46633291244506836, "sampling/sampling_logp_difference/mean": 0.01776830665767193, "step": 64, "step_time": 38.30412045493722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.535714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1357.232177734375, "completions/mean_terminated_length": 560.1923217773438, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6369673609733582, "epoch": 0.008270772362896042, "frac_reward_zero_std": 0.5, "grad_norm": 0.17798422276973724, "kl": 0.056278061121702194, "learning_rate": 9.918564702888409e-06, "loss": 0.0009, "num_tokens": 12803783.0, "reward": 0.3244048058986664, "reward_std": 0.36269986629486084, "rewards/unified_reward/mean": 0.324404776096344, "rewards/unified_reward/std": 0.36269986629486084, "sampling/importance_sampling_ratio/max": 2.2716684341430664, "sampling/importance_sampling_ratio/mean": 0.5201153755187988, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.66654372215271, "sampling/sampling_logp_difference/mean": 0.01961241289973259, "step": 65, "step_time": 44.95112147578038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375000298023224, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 1274.446533203125, "completions/mean_terminated_length": 672.793701171875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 1.0202776044607162, "epoch": 0.008398015014632904, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.13657136261463165, "kl": 0.05052824039012194, "learning_rate": 9.91729227637104e-06, "loss": 0.0331, "num_tokens": 12968009.0, "reward": 0.5473214387893677, "reward_std": 0.3845720589160919, "rewards/unified_reward/mean": 0.5473214387893677, "rewards/unified_reward/std": 0.3845720589160919, "sampling/importance_sampling_ratio/max": 1.818737268447876, "sampling/importance_sampling_ratio/mean": 0.43427255749702454, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.506500244140625, "sampling/sampling_logp_difference/mean": 0.01658845879137516, "step": 66, "step_time": 36.57059252029285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5178571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 1309.1160888671875, "completions/mean_terminated_length": 515.5, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 1.5701675415039062, "epoch": 0.008525257666369766, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.254994660615921, "kl": 0.058296043425798416, "learning_rate": 9.916019849853672e-06, "loss": 0.0019, "num_tokens": 13143590.0, "reward": 0.2254464328289032, "reward_std": 0.35748091340065, "rewards/unified_reward/mean": 0.2254464328289032, "rewards/unified_reward/std": 0.35748091340065, "sampling/importance_sampling_ratio/max": 2.5512704849243164, "sampling/importance_sampling_ratio/mean": 0.6098593473434448, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6396143436431885, "sampling/sampling_logp_difference/mean": 0.01859266497194767, "step": 67, "step_time": 37.698648578953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.8959076644241577e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.8959076644241577e-05, "completions/clipped_ratio": 0.5535714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 1425.2857666015625, "completions/mean_terminated_length": 653.1199951171875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 1.374956637620926, "epoch": 0.00865250031810663, "frac_reward_zero_std": 0.5, "grad_norm": 0.19107282161712646, "kl": 0.05587987322360277, "learning_rate": 9.914747423336303e-06, "loss": 0.0331, "num_tokens": 13343062.0, "reward": 0.3794642984867096, "reward_std": 0.33154159784317017, "rewards/unified_reward/mean": 0.3794642984867096, "rewards/unified_reward/std": 0.33154159784317017, "sampling/importance_sampling_ratio/max": 2.8578507900238037, "sampling/importance_sampling_ratio/mean": 0.42732545733451843, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6927089691162109, "sampling/sampling_logp_difference/mean": 0.019729871302843094, "step": 68, "step_time": 41.6620054889936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.392857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1061.321533203125, "completions/mean_terminated_length": 422.8823547363281, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.3140780329704285, "epoch": 0.008779742969843492, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.21369804441928864, "kl": 0.05595447774976492, "learning_rate": 9.913474996818935e-06, "loss": 0.081, "num_tokens": 13492930.0, "reward": 0.4754464626312256, "reward_std": 0.41548454761505127, "rewards/unified_reward/mean": 0.4754464328289032, "rewards/unified_reward/std": 0.41548454761505127, "sampling/importance_sampling_ratio/max": 2.981498956680298, "sampling/importance_sampling_ratio/mean": 0.6796631217002869, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48900651931762695, "sampling/sampling_logp_difference/mean": 0.0169901754707098, "step": 69, "step_time": 39.40999560011551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4464285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 1169.83935546875, "completions/mean_terminated_length": 461.6451416015625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 1.2264822125434875, "epoch": 0.008906985621580354, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.33650365471839905, "kl": 0.054274254478514194, "learning_rate": 9.912202570301566e-06, "loss": 0.1082, "num_tokens": 13646880.0, "reward": 0.35654765367507935, "reward_std": 0.35537034273147583, "rewards/unified_reward/mean": 0.35654762387275696, "rewards/unified_reward/std": 0.35537034273147583, "sampling/importance_sampling_ratio/max": 2.925198554992676, "sampling/importance_sampling_ratio/mean": 0.6838263273239136, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5464427471160889, "sampling/sampling_logp_difference/mean": 0.01862506940960884, "step": 70, "step_time": 35.89095195196569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4107142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1215.0, "completions/mean_terminated_length": 634.4242553710938, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.1029375493526459, "epoch": 0.009034228273317216, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.3143005073070526, "kl": 0.046241928823292255, "learning_rate": 9.910930143784198e-06, "loss": 0.035, "num_tokens": 13812104.0, "reward": 0.3571428656578064, "reward_std": 0.3957084119319916, "rewards/unified_reward/mean": 0.3571428656578064, "rewards/unified_reward/std": 0.39570844173431396, "sampling/importance_sampling_ratio/max": 2.9520294666290283, "sampling/importance_sampling_ratio/mean": 0.606016993522644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4887375831604004, "sampling/sampling_logp_difference/mean": 0.01702086068689823, "step": 71, "step_time": 36.68705596285872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.455357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 1273.5535888671875, "completions/mean_terminated_length": 626.0655517578125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 1.422107309103012, "epoch": 0.009161470925054078, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.34059759974479675, "kl": 0.057873290963470936, "learning_rate": 9.909657717266829e-06, "loss": 0.0384, "num_tokens": 13999646.0, "reward": 0.3636905252933502, "reward_std": 0.34524667263031006, "rewards/unified_reward/mean": 0.3636905252933502, "rewards/unified_reward/std": 0.34524667263031006, "sampling/importance_sampling_ratio/max": 2.899066209793091, "sampling/importance_sampling_ratio/mean": 0.5829665064811707, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7413234710693359, "sampling/sampling_logp_difference/mean": 0.019157100468873978, "step": 72, "step_time": 38.80828766594641 }, { "clip_ratio/high_max": 0.0012503353937063366, "clip_ratio/high_mean": 0.00017861934975371696, "clip_ratio/low_mean": 5.800598501082277e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018441994689055718, "completions/clipped_ratio": 0.4642857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 1227.9107666015625, "completions/mean_terminated_length": 517.1666870117188, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 2.1138623654842377, "epoch": 0.00928871357679094, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.25627508759498596, "kl": 0.0733109824359417, "learning_rate": 9.90838529074946e-06, "loss": 0.0521, "num_tokens": 14160684.0, "reward": 0.48273810744285583, "reward_std": 0.3244763910770416, "rewards/unified_reward/mean": 0.48273807764053345, "rewards/unified_reward/std": 0.3244763910770416, "sampling/importance_sampling_ratio/max": 2.9500207901000977, "sampling/importance_sampling_ratio/mean": 0.5567519068717957, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4814291000366211, "sampling/sampling_logp_difference/mean": 0.023315194994211197, "step": 73, "step_time": 36.65564211667515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4910714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1266.482177734375, "completions/mean_terminated_length": 512.385986328125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 1.209050714969635, "epoch": 0.009415956228527802, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.1579582244157791, "kl": 0.05672494322061539, "learning_rate": 9.907112864232092e-06, "loss": 0.0151, "num_tokens": 14327978.0, "reward": 0.2931548058986664, "reward_std": 0.29000324010849, "rewards/unified_reward/mean": 0.293154776096344, "rewards/unified_reward/std": 0.2900032103061676, "sampling/importance_sampling_ratio/max": 2.6507275104522705, "sampling/importance_sampling_ratio/mean": 0.5533388257026672, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.752349853515625, "sampling/sampling_logp_difference/mean": 0.01746242865920067, "step": 74, "step_time": 36.56521396711469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5535714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 1316.6160888671875, "completions/mean_terminated_length": 409.6999816894531, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 1.8634631633758545, "epoch": 0.009543198880264664, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.2623961865901947, "kl": 0.062117879278957844, "learning_rate": 9.905840437714723e-06, "loss": 0.1064, "num_tokens": 14511991.0, "reward": 0.3757440745830536, "reward_std": 0.37031957507133484, "rewards/unified_reward/mean": 0.3757440745830536, "rewards/unified_reward/std": 0.37031954526901245, "sampling/importance_sampling_ratio/max": 2.838329792022705, "sampling/importance_sampling_ratio/mean": 0.6000520586967468, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4505753517150879, "sampling/sampling_logp_difference/mean": 0.019673410803079605, "step": 75, "step_time": 43.3458453391213 }, { "clip_ratio/high_max": 0.0008847900608088821, "clip_ratio/high_mean": 0.00012639858323382214, "clip_ratio/low_mean": 5.659053385898005e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013205763389123604, "completions/clipped_ratio": 0.455357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 1074.4285888671875, "completions/mean_terminated_length": 260.4590148925781, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.057448834180832, "epoch": 0.009670441532001526, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.23769395053386688, "kl": 0.0872725062072277, "learning_rate": 9.904568011197355e-06, "loss": 0.077, "num_tokens": 14648783.0, "reward": 0.3221726417541504, "reward_std": 0.3949277400970459, "rewards/unified_reward/mean": 0.3221726417541504, "rewards/unified_reward/std": 0.3949277400970459, "sampling/importance_sampling_ratio/max": 2.9459056854248047, "sampling/importance_sampling_ratio/mean": 0.5776380300521851, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.546504020690918, "sampling/sampling_logp_difference/mean": 0.021403688937425613, "step": 76, "step_time": 33.87444955203682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.455357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 1106.607177734375, "completions/mean_terminated_length": 319.54095458984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.640007644891739, "epoch": 0.009797684183738388, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.2301185429096222, "kl": 0.11714457627385855, "learning_rate": 9.903295584679986e-06, "loss": 0.1036, "num_tokens": 14803459.0, "reward": 0.4296131432056427, "reward_std": 0.3859306871891022, "rewards/unified_reward/mean": 0.4296131134033203, "rewards/unified_reward/std": 0.3859306871891022, "sampling/importance_sampling_ratio/max": 2.7560064792633057, "sampling/importance_sampling_ratio/mean": 0.6496344804763794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7069374322891235, "sampling/sampling_logp_difference/mean": 0.019768277183175087, "step": 77, "step_time": 39.14458641316742 }, { "clip_ratio/high_max": 0.0016658897802699357, "clip_ratio/high_mean": 0.0002434770722175017, "clip_ratio/low_mean": 5.4928154895605985e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024896988907130435, "completions/clipped_ratio": 0.455357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1126.40185546875, "completions/mean_terminated_length": 355.8852233886719, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 1.810915470123291, "epoch": 0.009924926835475252, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2904089391231537, "kl": 0.06714983657002449, "learning_rate": 9.902023158162617e-06, "loss": 0.0278, "num_tokens": 14950944.0, "reward": 0.4092262387275696, "reward_std": 0.42910176515579224, "rewards/unified_reward/mean": 0.4092262089252472, "rewards/unified_reward/std": 0.4291017949581146, "sampling/importance_sampling_ratio/max": 2.464383363723755, "sampling/importance_sampling_ratio/mean": 0.6354340314865112, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49561548233032227, "sampling/sampling_logp_difference/mean": 0.021925469860434532, "step": 78, "step_time": 35.83007234451361 }, { "clip_ratio/high_max": 0.00031655587372370064, "clip_ratio/high_mean": 4.522226663539186e-05, "clip_ratio/low_mean": 4.007976167486049e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.530202831025235e-05, "completions/clipped_ratio": 0.3571428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 922.4732666015625, "completions/mean_terminated_length": 297.1805725097656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.8837078511714935, "epoch": 0.010052169487212114, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.27282023429870605, "kl": 0.08535495772957802, "learning_rate": 9.900750731645247e-06, "loss": 0.0519, "num_tokens": 15081733.0, "reward": 0.4657738506793976, "reward_std": 0.3431719243526459, "rewards/unified_reward/mean": 0.4657738208770752, "rewards/unified_reward/std": 0.3431719243526459, "sampling/importance_sampling_ratio/max": 2.795853853225708, "sampling/importance_sampling_ratio/mean": 0.6366590857505798, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47760701179504395, "sampling/sampling_logp_difference/mean": 0.02229791320860386, "step": 79, "step_time": 32.390752197941765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.330357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 849.9017944335938, "completions/mean_terminated_length": 258.8399963378906, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.7143522500991821, "epoch": 0.010179412138948976, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.390005886554718, "kl": 0.08042342029511929, "learning_rate": 9.89947830512788e-06, "loss": 0.0283, "num_tokens": 15193562.0, "reward": 0.5130952596664429, "reward_std": 0.4103175699710846, "rewards/unified_reward/mean": 0.5130952596664429, "rewards/unified_reward/std": 0.4103175401687622, "sampling/importance_sampling_ratio/max": 2.8324713706970215, "sampling/importance_sampling_ratio/mean": 0.7327591776847839, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49454933404922485, "sampling/sampling_logp_difference/mean": 0.022130077704787254, "step": 80, "step_time": 30.388898227130994 }, { "clip_ratio/high_max": 0.0011463359696790576, "clip_ratio/high_mean": 0.00016376228450099006, "clip_ratio/low_mean": 1.123620768339606e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017499849218438612, "completions/clipped_ratio": 0.4196428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 1053.6429443359375, "completions/mean_terminated_length": 334.6461486816406, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.910503625869751, "epoch": 0.010306654790685838, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.40965667366981506, "kl": 0.09058106504380703, "learning_rate": 9.89820587861051e-06, "loss": 0.0834, "num_tokens": 15360386.0, "reward": 0.4125000238418579, "reward_std": 0.36368224024772644, "rewards/unified_reward/mean": 0.4124999940395355, "rewards/unified_reward/std": 0.36368224024772644, "sampling/importance_sampling_ratio/max": 2.4531280994415283, "sampling/importance_sampling_ratio/mean": 0.600866973400116, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4820270538330078, "sampling/sampling_logp_difference/mean": 0.022288909181952477, "step": 81, "step_time": 44.973374725785106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5178571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 1232.571533203125, "completions/mean_terminated_length": 356.7407531738281, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.4403324723243713, "epoch": 0.0104338974424227, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.35087817907333374, "kl": 0.091470280662179, "learning_rate": 9.896933452093143e-06, "loss": 0.0492, "num_tokens": 15519154.0, "reward": 0.4159226715564728, "reward_std": 0.3770996630191803, "rewards/unified_reward/mean": 0.4159226417541504, "rewards/unified_reward/std": 0.3770996630191803, "sampling/importance_sampling_ratio/max": 2.8456544876098633, "sampling/importance_sampling_ratio/mean": 0.49787598848342896, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9968366622924805, "sampling/sampling_logp_difference/mean": 0.027757719159126282, "step": 82, "step_time": 36.94151782011613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4017857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 962.3035888671875, "completions/mean_terminated_length": 233.10447692871094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.3240691423416138, "epoch": 0.010561140094159562, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.5751058459281921, "kl": 0.11252031102776527, "learning_rate": 9.895661025575773e-06, "loss": -0.0775, "num_tokens": 15662444.0, "reward": 0.3712798058986664, "reward_std": 0.3560025095939636, "rewards/unified_reward/mean": 0.371279776096344, "rewards/unified_reward/std": 0.35600247979164124, "sampling/importance_sampling_ratio/max": 2.469900608062744, "sampling/importance_sampling_ratio/mean": 0.6443725228309631, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.002781867980957, "sampling/sampling_logp_difference/mean": 0.026448559015989304, "step": 83, "step_time": 38.53726287744939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 9.006926120491698e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 9.006926120491698e-06, "completions/clipped_ratio": 0.4642857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1091.3035888671875, "completions/mean_terminated_length": 262.16668701171875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.422715485095978, "epoch": 0.010688382745896424, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3656783103942871, "kl": 0.09996412880718708, "learning_rate": 9.894388599058406e-06, "loss": -0.011, "num_tokens": 15806646.0, "reward": 0.34940478205680847, "reward_std": 0.35951200127601624, "rewards/unified_reward/mean": 0.34940478205680847, "rewards/unified_reward/std": 0.35951200127601624, "sampling/importance_sampling_ratio/max": 2.6906721591949463, "sampling/importance_sampling_ratio/mean": 0.6601525545120239, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.40915751457214355, "sampling/sampling_logp_difference/mean": 0.025103481486439705, "step": 84, "step_time": 34.75176134170033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.795889880275354e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.795889880275354e-05, "completions/clipped_ratio": 0.6339285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 1419.6160888671875, "completions/mean_terminated_length": 331.43902587890625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 2.4752771854400635, "epoch": 0.010815625397633286, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.2437400370836258, "kl": 0.10490662790834904, "learning_rate": 9.893116172541036e-06, "loss": -0.0236, "num_tokens": 16008731.0, "reward": 0.3742559850215912, "reward_std": 0.3773484230041504, "rewards/unified_reward/mean": 0.3742559552192688, "rewards/unified_reward/std": 0.377348393201828, "sampling/importance_sampling_ratio/max": 2.8905155658721924, "sampling/importance_sampling_ratio/mean": 0.5263186097145081, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8178582191467285, "sampling/sampling_logp_difference/mean": 0.02668244205415249, "step": 85, "step_time": 48.91595171811059 }, { "clip_ratio/high_max": 0.0014947803792892955, "clip_ratio/high_mean": 0.00022661900948151015, "clip_ratio/low_mean": 8.404248728766106e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003106615040451288, "completions/clipped_ratio": 0.6339285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 1395.3035888671875, "completions/mean_terminated_length": 265.0243835449219, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.6729207634925842, "epoch": 0.010942868049370148, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2646370530128479, "kl": 0.10212551802396774, "learning_rate": 9.891843746023669e-06, "loss": 0.0378, "num_tokens": 16196685.0, "reward": 0.3561011850833893, "reward_std": 0.42128071188926697, "rewards/unified_reward/mean": 0.3561011850833893, "rewards/unified_reward/std": 0.42128074169158936, "sampling/importance_sampling_ratio/max": 2.5957677364349365, "sampling/importance_sampling_ratio/mean": 0.6028844714164734, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.44208216667175293, "sampling/sampling_logp_difference/mean": 0.026679711416363716, "step": 86, "step_time": 44.09481039014645 }, { "clip_ratio/high_max": 0.002281671069795266, "clip_ratio/high_mean": 0.00046546193334506825, "clip_ratio/low_mean": 2.1798270608996972e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004872601930401288, "completions/clipped_ratio": 0.598214328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 1334.1607666015625, "completions/mean_terminated_length": 271.3333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.495092749595642, "epoch": 0.01107011070110701, "frac_reward_zero_std": 0.5, "grad_norm": 0.2341819405555725, "kl": 0.10219733230769634, "learning_rate": 9.890571319506298e-06, "loss": 0.0347, "num_tokens": 16363215.0, "reward": 0.4598214626312256, "reward_std": 0.42653220891952515, "rewards/unified_reward/mean": 0.4598214328289032, "rewards/unified_reward/std": 0.42653217911720276, "sampling/importance_sampling_ratio/max": 2.3621373176574707, "sampling/importance_sampling_ratio/mean": 0.49040910601615906, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5966962575912476, "sampling/sampling_logp_difference/mean": 0.027918418869376183, "step": 87, "step_time": 37.51807946735062 }, { "clip_ratio/high_max": 0.0010136434393643867, "clip_ratio/high_mean": 0.0002102010180351499, "clip_ratio/low_mean": 2.0044664779561572e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023024568145046942, "completions/clipped_ratio": 0.6875000596046448, "completions/max_length": 2048.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 1511.544677734375, "completions/mean_terminated_length": 331.3428649902344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.742771089076996, "epoch": 0.011197353352843874, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3037218153476715, "kl": 0.10001987963914871, "learning_rate": 9.889298892988931e-06, "loss": 0.0173, "num_tokens": 16577132.0, "reward": 0.322172611951828, "reward_std": 0.34700486063957214, "rewards/unified_reward/mean": 0.322172611951828, "rewards/unified_reward/std": 0.34700483083724976, "sampling/importance_sampling_ratio/max": 2.699075222015381, "sampling/importance_sampling_ratio/mean": 0.41856876015663147, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49785423278808594, "sampling/sampling_logp_difference/mean": 0.02742994949221611, "step": 88, "step_time": 51.72071473184042 }, { "clip_ratio/high_max": 0.0007695179665461183, "clip_ratio/high_mean": 0.0001271097280550748, "clip_ratio/low_mean": 4.098612771485932e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00016809585576993413, "completions/clipped_ratio": 0.6160714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 1337.0804443359375, "completions/mean_terminated_length": 196.3023223876953, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.557495653629303, "epoch": 0.011324596004580736, "frac_reward_zero_std": 0.5, "grad_norm": 0.32280778884887695, "kl": 0.09424899145960808, "learning_rate": 9.888026466471561e-06, "loss": 0.0513, "num_tokens": 16750989.0, "reward": 0.4650298058986664, "reward_std": 0.38174009323120117, "rewards/unified_reward/mean": 0.465029776096344, "rewards/unified_reward/std": 0.38174015283584595, "sampling/importance_sampling_ratio/max": 2.8244717121124268, "sampling/importance_sampling_ratio/mean": 0.5794817805290222, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8600373268127441, "sampling/sampling_logp_difference/mean": 0.02611408196389675, "step": 89, "step_time": 38.473882807884365 }, { "clip_ratio/high_max": 0.0003510414098855108, "clip_ratio/high_mean": 5.0148773880209774e-05, "clip_ratio/low_mean": 6.142061101854779e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011156938489875756, "completions/clipped_ratio": 0.4821428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 1120.6429443359375, "completions/mean_terminated_length": 257.2413635253906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.182372897863388, "epoch": 0.011451838656317598, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.31772443652153015, "kl": 0.09236514754593372, "learning_rate": 9.886754039954194e-06, "loss": -0.012, "num_tokens": 16910805.0, "reward": 0.3199405074119568, "reward_std": 0.4094042181968689, "rewards/unified_reward/mean": 0.3199405074119568, "rewards/unified_reward/std": 0.4094042181968689, "sampling/importance_sampling_ratio/max": 2.7242276668548584, "sampling/importance_sampling_ratio/mean": 0.6344141364097595, "sampling/importance_sampling_ratio/min": 0.0004413121787365526, "sampling/sampling_logp_difference/max": 0.6149616241455078, "sampling/sampling_logp_difference/mean": 0.0229774322360754, "step": 90, "step_time": 40.00698684388772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 1439.0179443359375, "completions/mean_terminated_length": 253.1052703857422, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.402832478284836, "epoch": 0.01157908130805446, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3631414473056793, "kl": 0.09140937775373459, "learning_rate": 9.885481613436824e-06, "loss": 0.0721, "num_tokens": 17096415.0, "reward": 0.3886905014514923, "reward_std": 0.3631018102169037, "rewards/unified_reward/mean": 0.3886904716491699, "rewards/unified_reward/std": 0.3631017804145813, "sampling/importance_sampling_ratio/max": 2.9806160926818848, "sampling/importance_sampling_ratio/mean": 0.5384889841079712, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4982442855834961, "sampling/sampling_logp_difference/mean": 0.024668609723448753, "step": 91, "step_time": 39.418066962389275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 1378.21435546875, "completions/mean_terminated_length": 343.0909118652344, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 2.6246114373207092, "epoch": 0.011706323959791322, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.20318756997585297, "kl": 0.10018211230635643, "learning_rate": 9.884209186919457e-06, "loss": -0.023, "num_tokens": 17272127.0, "reward": 0.44107145071029663, "reward_std": 0.3527308404445648, "rewards/unified_reward/mean": 0.44107145071029663, "rewards/unified_reward/std": 0.3527308702468872, "sampling/importance_sampling_ratio/max": 2.203400135040283, "sampling/importance_sampling_ratio/mean": 0.5021259188652039, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7002818584442139, "sampling/sampling_logp_difference/mean": 0.02507636323571205, "step": 92, "step_time": 38.233491857070476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5446428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 1238.7679443359375, "completions/mean_terminated_length": 270.8627624511719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.389509618282318, "epoch": 0.011833566611528184, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.18474619090557098, "kl": 0.11006925068795681, "learning_rate": 9.882936760402087e-06, "loss": 0.0135, "num_tokens": 17435613.0, "reward": 0.5254464149475098, "reward_std": 0.36624982953071594, "rewards/unified_reward/mean": 0.5254464149475098, "rewards/unified_reward/std": 0.36624982953071594, "sampling/importance_sampling_ratio/max": 2.6023032665252686, "sampling/importance_sampling_ratio/mean": 0.532792866230011, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4933462142944336, "sampling/sampling_logp_difference/mean": 0.023468980565667152, "step": 93, "step_time": 38.10718381102197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4910714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 1154.0, "completions/mean_terminated_length": 291.368408203125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.5081658363342285, "epoch": 0.011960809263265046, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.3693309724330902, "kl": 0.09661157988011837, "learning_rate": 9.881664333884718e-06, "loss": 0.0825, "num_tokens": 17591333.0, "reward": 0.6443452835083008, "reward_std": 0.3061314821243286, "rewards/unified_reward/mean": 0.6443452835083008, "rewards/unified_reward/std": 0.3061314523220062, "sampling/importance_sampling_ratio/max": 2.150104284286499, "sampling/importance_sampling_ratio/mean": 0.5794796943664551, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.479816198348999, "sampling/sampling_logp_difference/mean": 0.023785166442394257, "step": 94, "step_time": 38.024095623986796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.535714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1231.8125, "completions/mean_terminated_length": 290.0577087402344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.31498059630394, "epoch": 0.012088051915001908, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3790145516395569, "kl": 0.14272433333098888, "learning_rate": 9.880391907367351e-06, "loss": 0.006, "num_tokens": 17760680.0, "reward": 0.36622029542922974, "reward_std": 0.3188871145248413, "rewards/unified_reward/mean": 0.36622026562690735, "rewards/unified_reward/std": 0.3188871145248413, "sampling/importance_sampling_ratio/max": 2.9947211742401123, "sampling/importance_sampling_ratio/mean": 0.6254329681396484, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6230249404907227, "sampling/sampling_logp_difference/mean": 0.02199701778590679, "step": 95, "step_time": 37.412926314165816 }, { "clip_ratio/high_max": 0.00010820168972713873, "clip_ratio/high_mean": 1.5457384506589733e-05, "clip_ratio/low_mean": 1.716439874144271e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.262178324803244e-05, "completions/clipped_ratio": 0.535714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 1231.4554443359375, "completions/mean_terminated_length": 289.2884826660156, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.854539394378662, "epoch": 0.01221529456673877, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.43439486622810364, "kl": 0.12755890004336834, "learning_rate": 9.879119480849981e-06, "loss": -0.0252, "num_tokens": 17923003.0, "reward": 0.4583333730697632, "reward_std": 0.3694746196269989, "rewards/unified_reward/mean": 0.4583333432674408, "rewards/unified_reward/std": 0.3694746196269989, "sampling/importance_sampling_ratio/max": 2.667478322982788, "sampling/importance_sampling_ratio/mean": 0.605130136013031, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5593032836914062, "sampling/sampling_logp_difference/mean": 0.025465352460741997, "step": 96, "step_time": 38.347246256889775 }, { "clip_ratio/high_max": 0.0013645860308315605, "clip_ratio/high_mean": 0.0002820579393301159, "clip_ratio/low_mean": 0.00012104537699997309, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00040310331132786814, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 1347.3035888671875, "completions/mean_terminated_length": 179.4761962890625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.767608165740967, "epoch": 0.012342537218475632, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.33296313881874084, "kl": 0.18287094496190548, "learning_rate": 9.877847054332614e-06, "loss": 0.0552, "num_tokens": 18105037.0, "reward": 0.4799107313156128, "reward_std": 0.3725513219833374, "rewards/unified_reward/mean": 0.4799107015132904, "rewards/unified_reward/std": 0.3725513219833374, "sampling/importance_sampling_ratio/max": 2.797318935394287, "sampling/importance_sampling_ratio/mean": 0.533126175403595, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.725426435470581, "sampling/sampling_logp_difference/mean": 0.02666940726339817, "step": 97, "step_time": 39.41301047592424 }, { "clip_ratio/high_max": 0.002101440157275647, "clip_ratio/high_mean": 0.0003002057346748188, "clip_ratio/low_mean": 2.8503021894721314e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00032870876020751894, "completions/clipped_ratio": 0.5178571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 1113.8304443359375, "completions/mean_terminated_length": 110.46296691894531, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.510573923587799, "epoch": 0.012469779870212496, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.2970440089702606, "kl": 0.11873975768685341, "learning_rate": 9.876574627815244e-06, "loss": 0.0087, "num_tokens": 18275282.0, "reward": 0.419494092464447, "reward_std": 0.40158525109291077, "rewards/unified_reward/mean": 0.419494092464447, "rewards/unified_reward/std": 0.40158525109291077, "sampling/importance_sampling_ratio/max": 2.5812809467315674, "sampling/importance_sampling_ratio/mean": 0.5989059805870056, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.700385332107544, "sampling/sampling_logp_difference/mean": 0.024160189554095268, "step": 98, "step_time": 39.24445105786435 }, { "clip_ratio/high_max": 0.0022050021216273308, "clip_ratio/high_mean": 0.00041584541395423, "clip_ratio/low_mean": 5.9984698964399286e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004758301001857035, "completions/clipped_ratio": 0.723214328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1611.0, "completions/mean_length": 1586.8929443359375, "completions/mean_terminated_length": 382.06451416015625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 2.6358023285865784, "epoch": 0.012597022521949358, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.17768195271492004, "kl": 0.07744564116001129, "learning_rate": 9.875302201297877e-06, "loss": -0.0349, "num_tokens": 18472086.0, "reward": 0.2514881193637848, "reward_std": 0.3574220538139343, "rewards/unified_reward/mean": 0.2514881193637848, "rewards/unified_reward/std": 0.3574220538139343, "sampling/importance_sampling_ratio/max": 2.860272169113159, "sampling/importance_sampling_ratio/mean": 0.4227684438228607, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48222780227661133, "sampling/sampling_logp_difference/mean": 0.024067947641015053, "step": 99, "step_time": 41.56489620334469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5892857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 1297.857177734375, "completions/mean_terminated_length": 221.56521606445312, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5291078686714172, "epoch": 0.01272426517368622, "frac_reward_zero_std": 0.5, "grad_norm": 0.14703749120235443, "kl": 0.09441466815769672, "learning_rate": 9.874029774780507e-06, "loss": -0.0038, "num_tokens": 18640590.0, "reward": 0.4476190507411957, "reward_std": 0.38522419333457947, "rewards/unified_reward/mean": 0.4476190209388733, "rewards/unified_reward/std": 0.38522419333457947, "sampling/importance_sampling_ratio/max": 2.3533833026885986, "sampling/importance_sampling_ratio/mean": 0.5248671770095825, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7355923652648926, "sampling/sampling_logp_difference/mean": 0.02422824688255787, "step": 100, "step_time": 37.92198225483298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.8858833047706867e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.8858833047706867e-05, "completions/clipped_ratio": 0.6785714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 1470.2501220703125, "completions/mean_terminated_length": 250.55555725097656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.3144665956497192, "epoch": 0.012851507825423082, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.23662197589874268, "kl": 0.07987154833972454, "learning_rate": 9.87275734826314e-06, "loss": 0.0519, "num_tokens": 18826434.0, "reward": 0.3083333373069763, "reward_std": 0.4035937190055847, "rewards/unified_reward/mean": 0.3083333373069763, "rewards/unified_reward/std": 0.4035937190055847, "sampling/importance_sampling_ratio/max": 2.813206434249878, "sampling/importance_sampling_ratio/mean": 0.5821375250816345, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48939990997314453, "sampling/sampling_logp_difference/mean": 0.02227173186838627, "step": 101, "step_time": 39.124227217864245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 1.743861685099546e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.743861685099546e-05, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 1250.107177734375, "completions/mean_terminated_length": 224.24488830566406, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.0866742730140686, "epoch": 0.012978750477159944, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.2622333765029907, "kl": 0.10534469783306122, "learning_rate": 9.87148492174577e-06, "loss": 0.0073, "num_tokens": 18992990.0, "reward": 0.3898809850215912, "reward_std": 0.38833311200141907, "rewards/unified_reward/mean": 0.3898809552192688, "rewards/unified_reward/std": 0.3883330821990967, "sampling/importance_sampling_ratio/max": 1.9087847471237183, "sampling/importance_sampling_ratio/mean": 0.5910433530807495, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.492673397064209, "sampling/sampling_logp_difference/mean": 0.022313915193080902, "step": 102, "step_time": 38.56025215587579 }, { "clip_ratio/high_max": 0.0012074703990947455, "clip_ratio/high_mean": 0.00017249577285838313, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017249577285838313, "completions/clipped_ratio": 0.5803571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 1304.6875, "completions/mean_terminated_length": 276.7021179199219, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.649589717388153, "epoch": 0.013105993128896806, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2561420500278473, "kl": 0.09465093910694122, "learning_rate": 9.870212495228403e-06, "loss": -0.0061, "num_tokens": 19157739.0, "reward": 0.3452381193637848, "reward_std": 0.3515761196613312, "rewards/unified_reward/mean": 0.3452381193637848, "rewards/unified_reward/std": 0.3515760898590088, "sampling/importance_sampling_ratio/max": 2.9772491455078125, "sampling/importance_sampling_ratio/mean": 0.548335611820221, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.44234180450439453, "sampling/sampling_logp_difference/mean": 0.025543443858623505, "step": 103, "step_time": 37.6319637526758 }, { "clip_ratio/high_max": 0.001897023932542652, "clip_ratio/high_mean": 0.0004169760723016225, "clip_ratio/low_mean": 0.00022984526640357217, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006468213250627741, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 1395.857177734375, "completions/mean_terminated_length": 308.952392578125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 2.3823999762535095, "epoch": 0.013233235780633668, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.16364197432994843, "kl": 0.08775224909186363, "learning_rate": 9.868940068711032e-06, "loss": 0.0491, "num_tokens": 19331563.0, "reward": 0.4471726715564728, "reward_std": 0.4238857328891754, "rewards/unified_reward/mean": 0.4471726417541504, "rewards/unified_reward/std": 0.4238857328891754, "sampling/importance_sampling_ratio/max": 2.468686103820801, "sampling/importance_sampling_ratio/mean": 0.4773319661617279, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47565317153930664, "sampling/sampling_logp_difference/mean": 0.02412753365933895, "step": 104, "step_time": 38.00894902832806 }, { "clip_ratio/high_max": 0.0019488417892716825, "clip_ratio/high_mean": 0.00027840597613248974, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00027840597613248974, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 1286.0804443359375, "completions/mean_terminated_length": 306.4693908691406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.8416560292243958, "epoch": 0.01336047843237053, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.22038742899894714, "kl": 0.15506763570010662, "learning_rate": 9.867667642193665e-06, "loss": 0.0733, "num_tokens": 19508868.0, "reward": 0.414434552192688, "reward_std": 0.3839986324310303, "rewards/unified_reward/mean": 0.4144345223903656, "rewards/unified_reward/std": 0.3839986026287079, "sampling/importance_sampling_ratio/max": 2.881793260574341, "sampling/importance_sampling_ratio/mean": 0.5561447739601135, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49938392639160156, "sampling/sampling_logp_difference/mean": 0.020070310682058334, "step": 105, "step_time": 40.89181238482706 }, { "clip_ratio/high_max": 8.054123463807628e-05, "clip_ratio/high_mean": 1.1505891052365769e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.1505891052365769e-05, "completions/clipped_ratio": 0.4375000298023224, "completions/max_length": 2048.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 985.169677734375, "completions/mean_terminated_length": 158.52381896972656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.2384165227413177, "epoch": 0.013487721084107392, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.3260761797428131, "kl": 0.09107317216694355, "learning_rate": 9.866395215676295e-06, "loss": 0.1564, "num_tokens": 19677855.0, "reward": 0.3586309850215912, "reward_std": 0.3866580128669739, "rewards/unified_reward/mean": 0.3586309552192688, "rewards/unified_reward/std": 0.3866580128669739, "sampling/importance_sampling_ratio/max": 2.7680299282073975, "sampling/importance_sampling_ratio/mean": 0.6438447833061218, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49509763717651367, "sampling/sampling_logp_difference/mean": 0.02347174473106861, "step": 106, "step_time": 46.29151956201531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.392857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 974.9910888671875, "completions/mean_terminated_length": 280.6911926269531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.273023784160614, "epoch": 0.013614963735844254, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.5760468244552612, "kl": 0.12135850824415684, "learning_rate": 9.865122789158928e-06, "loss": 0.1404, "num_tokens": 19815798.0, "reward": 0.43229174613952637, "reward_std": 0.4025834798812866, "rewards/unified_reward/mean": 0.432291716337204, "rewards/unified_reward/std": 0.402583509683609, "sampling/importance_sampling_ratio/max": 2.6531710624694824, "sampling/importance_sampling_ratio/mean": 0.6970413327217102, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48960065841674805, "sampling/sampling_logp_difference/mean": 0.02571874111890793, "step": 107, "step_time": 34.68782837619074 }, { "clip_ratio/high_max": 0.0001851303386501968, "clip_ratio/high_mean": 2.644719097588677e-05, "clip_ratio/low_mean": 1.7636745724303182e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 4.408393670018995e-05, "completions/clipped_ratio": 0.5178571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1244.0179443359375, "completions/mean_terminated_length": 380.4814758300781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9279430508613586, "epoch": 0.013742206387581116, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.17345218360424042, "kl": 0.27635412104427814, "learning_rate": 9.863850362641558e-06, "loss": 0.024, "num_tokens": 19995696.0, "reward": 0.4522321820259094, "reward_std": 0.44783708453178406, "rewards/unified_reward/mean": 0.45223215222358704, "rewards/unified_reward/std": 0.44783708453178406, "sampling/importance_sampling_ratio/max": 2.1172196865081787, "sampling/importance_sampling_ratio/mean": 0.5462964177131653, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6817548274993896, "sampling/sampling_logp_difference/mean": 0.021992573514580727, "step": 108, "step_time": 40.862235945183784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375000298023224, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1056.4285888671875, "completions/mean_terminated_length": 285.20635986328125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.045448660850525, "epoch": 0.01386944903931798, "frac_reward_zero_std": 0.5, "grad_norm": 0.18325065076351166, "kl": 0.11687890626490116, "learning_rate": 9.86257793612419e-06, "loss": 0.0538, "num_tokens": 20135984.0, "reward": 0.46726194024086, "reward_std": 0.4080761671066284, "rewards/unified_reward/mean": 0.46726194024086, "rewards/unified_reward/std": 0.4080761969089508, "sampling/importance_sampling_ratio/max": 2.5983693599700928, "sampling/importance_sampling_ratio/mean": 0.6311143040657043, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5121381282806396, "sampling/sampling_logp_difference/mean": 0.02203000709414482, "step": 109, "step_time": 34.23320445488207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3750000298023224, "completions/max_length": 2048.0, "completions/max_terminated_length": 1847.0, "completions/mean_length": 940.2232666015625, "completions/mean_terminated_length": 275.5571594238281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.826830416917801, "epoch": 0.013996691691054842, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.28605133295059204, "kl": 0.10996402613818645, "learning_rate": 9.86130550960682e-06, "loss": 0.0346, "num_tokens": 20265897.0, "reward": 0.41711312532424927, "reward_std": 0.3843892216682434, "rewards/unified_reward/mean": 0.4171130955219269, "rewards/unified_reward/std": 0.3843892514705658, "sampling/importance_sampling_ratio/max": 2.8689613342285156, "sampling/importance_sampling_ratio/mean": 0.701940655708313, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49773073196411133, "sampling/sampling_logp_difference/mean": 0.02248402312397957, "step": 110, "step_time": 32.26038661180064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.267857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 705.5357666015625, "completions/mean_terminated_length": 214.39024353027344, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 2.0422787964344025, "epoch": 0.014123934342791704, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.5627869963645935, "kl": 0.1411299891769886, "learning_rate": 9.860033083089452e-06, "loss": 0.1979, "num_tokens": 20369309.0, "reward": 0.43169647455215454, "reward_std": 0.32652169466018677, "rewards/unified_reward/mean": 0.43169644474983215, "rewards/unified_reward/std": 0.32652169466018677, "sampling/importance_sampling_ratio/max": 2.400120496749878, "sampling/importance_sampling_ratio/mean": 0.7164126634597778, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9227216243743896, "sampling/sampling_logp_difference/mean": 0.023344416171312332, "step": 111, "step_time": 30.104106766870245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 437.2410888671875, "completions/mean_terminated_length": 168.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.767555683851242, "epoch": 0.014251176994528566, "frac_reward_zero_std": 0.5, "grad_norm": 0.5364320874214172, "kl": 0.19965175166726112, "learning_rate": 9.858760656572083e-06, "loss": 0.1093, "num_tokens": 20438072.0, "reward": 0.4635416865348816, "reward_std": 0.3709585666656494, "rewards/unified_reward/mean": 0.4635416865348816, "rewards/unified_reward/std": 0.3709585964679718, "sampling/importance_sampling_ratio/max": 2.8980090618133545, "sampling/importance_sampling_ratio/mean": 0.9059425592422485, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.602729082107544, "sampling/sampling_logp_difference/mean": 0.024319393560290337, "step": 112, "step_time": 24.824311978183687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 539.0178833007812, "completions/mean_terminated_length": 190.7912139892578, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8865955770015717, "epoch": 0.014378419646265428, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.7239094376564026, "kl": 0.14762437902390957, "learning_rate": 9.857488230054715e-06, "loss": 0.095, "num_tokens": 20529802.0, "reward": 0.38482144474983215, "reward_std": 0.3822615444660187, "rewards/unified_reward/mean": 0.38482141494750977, "rewards/unified_reward/std": 0.38226157426834106, "sampling/importance_sampling_ratio/max": 2.71977162361145, "sampling/importance_sampling_ratio/mean": 0.8022545576095581, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.520255446434021, "sampling/sampling_logp_difference/mean": 0.024975767359137535, "step": 113, "step_time": 31.748405362013727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1713.0, "completions/mean_length": 532.1339721679688, "completions/mean_terminated_length": 260.8736877441406, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.775121808052063, "epoch": 0.01450566229800229, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.5660966038703918, "kl": 0.26598022133111954, "learning_rate": 9.856215803537346e-06, "loss": 0.1379, "num_tokens": 20612089.0, "reward": 0.4444940686225891, "reward_std": 0.2926560938358307, "rewards/unified_reward/mean": 0.4444940388202667, "rewards/unified_reward/std": 0.2926560938358307, "sampling/importance_sampling_ratio/max": 2.4427621364593506, "sampling/importance_sampling_ratio/mean": 0.7658088803291321, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8750953674316406, "sampling/sampling_logp_difference/mean": 0.02429373934864998, "step": 114, "step_time": 25.432397505966946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 630.3214721679688, "completions/mean_terminated_length": 263.9550476074219, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 1.8090127110481262, "epoch": 0.014632904949739152, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.32206374406814575, "kl": 0.14330532774329185, "learning_rate": 9.854943377019978e-06, "loss": 0.0905, "num_tokens": 20716885.0, "reward": 0.5395833849906921, "reward_std": 0.3521713614463806, "rewards/unified_reward/mean": 0.5395833253860474, "rewards/unified_reward/std": 0.3521713614463806, "sampling/importance_sampling_ratio/max": 2.4828975200653076, "sampling/importance_sampling_ratio/mean": 0.7592765092849731, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48456525802612305, "sampling/sampling_logp_difference/mean": 0.02113695815205574, "step": 115, "step_time": 35.025336327031255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1519.0, "completions/mean_length": 706.6250610351562, "completions/mean_terminated_length": 237.95179748535156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9153054654598236, "epoch": 0.014760147601476014, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.31240054965019226, "kl": 0.1374611333012581, "learning_rate": 9.853670950502609e-06, "loss": 0.0028, "num_tokens": 20826595.0, "reward": 0.5029762387275696, "reward_std": 0.40946152806282043, "rewards/unified_reward/mean": 0.5029762387275696, "rewards/unified_reward/std": 0.40946152806282043, "sampling/importance_sampling_ratio/max": 2.5902884006500244, "sampling/importance_sampling_ratio/mean": 0.632017970085144, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8093652725219727, "sampling/sampling_logp_difference/mean": 0.024598028510808945, "step": 116, "step_time": 32.26931599969976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.133928582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 632.6607666015625, "completions/mean_terminated_length": 413.7937927246094, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 1.7886682450771332, "epoch": 0.014887390253212876, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3417836129665375, "kl": 0.12175150215625763, "learning_rate": 9.85239852398524e-06, "loss": -0.0299, "num_tokens": 20921773.0, "reward": 0.5108631253242493, "reward_std": 0.3867015540599823, "rewards/unified_reward/mean": 0.5108631253242493, "rewards/unified_reward/std": 0.3867015540599823, "sampling/importance_sampling_ratio/max": 2.5742270946502686, "sampling/importance_sampling_ratio/mean": 0.6578264236450195, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4702167510986328, "sampling/sampling_logp_difference/mean": 0.024096298962831497, "step": 117, "step_time": 26.753968943841755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 486.9464416503906, "completions/mean_terminated_length": 281.9595947265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.6674061119556427, "epoch": 0.015014632904949738, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.6370129585266113, "kl": 0.16532320901751518, "learning_rate": 9.851126097467872e-06, "loss": 0.0457, "num_tokens": 21000495.0, "reward": 0.6166666746139526, "reward_std": 0.3878471553325653, "rewards/unified_reward/mean": 0.6166666746139526, "rewards/unified_reward/std": 0.3878471553325653, "sampling/importance_sampling_ratio/max": 2.3484690189361572, "sampling/importance_sampling_ratio/mean": 0.8019793629646301, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49065256118774414, "sampling/sampling_logp_difference/mean": 0.022777460515499115, "step": 118, "step_time": 24.293280977290124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 368.0982360839844, "completions/mean_terminated_length": 256.1047668457031, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.1269321590662003, "epoch": 0.015141875556686602, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.6646873354911804, "kl": 0.2561023570597172, "learning_rate": 9.849853670950503e-06, "loss": -0.0077, "num_tokens": 21065762.0, "reward": 0.5419643521308899, "reward_std": 0.3699145019054413, "rewards/unified_reward/mean": 0.5419643521308899, "rewards/unified_reward/std": 0.36991453170776367, "sampling/importance_sampling_ratio/max": 2.6895735263824463, "sampling/importance_sampling_ratio/mean": 0.9097280502319336, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4467456340789795, "sampling/sampling_logp_difference/mean": 0.018577806651592255, "step": 119, "step_time": 23.470694329123944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0892857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 482.107177734375, "completions/mean_terminated_length": 328.5882568359375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.7214686870574951, "epoch": 0.015269118208423464, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.6633262634277344, "kl": 0.22933654859662056, "learning_rate": 9.848581244433135e-06, "loss": -0.0362, "num_tokens": 21151358.0, "reward": 0.46711310744285583, "reward_std": 0.39466193318367004, "rewards/unified_reward/mean": 0.46711307764053345, "rewards/unified_reward/std": 0.39466193318367004, "sampling/importance_sampling_ratio/max": 2.7194340229034424, "sampling/importance_sampling_ratio/mean": 0.7888540625572205, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5479533672332764, "sampling/sampling_logp_difference/mean": 0.02118847146630287, "step": 120, "step_time": 25.68748825765215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 384.3035888671875, "completions/mean_terminated_length": 203.10890197753906, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.9419007897377014, "epoch": 0.015396360860160326, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.6012699007987976, "kl": 0.2404048591852188, "learning_rate": 9.847308817915766e-06, "loss": 0.0665, "num_tokens": 21245792.0, "reward": 0.45997029542922974, "reward_std": 0.3195534646511078, "rewards/unified_reward/mean": 0.45997026562690735, "rewards/unified_reward/std": 0.3195534646511078, "sampling/importance_sampling_ratio/max": 2.4783856868743896, "sampling/importance_sampling_ratio/mean": 0.8521426320075989, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.513817548751831, "sampling/sampling_logp_difference/mean": 0.023007670417428017, "step": 121, "step_time": 37.16991008305922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0357142873108387, "completions/max_length": 2048.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 271.51788330078125, "completions/mean_terminated_length": 205.72222900390625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.663751482963562, "epoch": 0.015523603511897188, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.8381054401397705, "kl": 0.25058531016111374, "learning_rate": 9.846036391398398e-06, "loss": 0.0853, "num_tokens": 21297178.0, "reward": 0.3508928716182709, "reward_std": 0.38367486000061035, "rewards/unified_reward/mean": 0.3508928418159485, "rewards/unified_reward/std": 0.38367486000061035, "sampling/importance_sampling_ratio/max": 2.0547406673431396, "sampling/importance_sampling_ratio/mean": 0.8571892976760864, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.499969482421875, "sampling/sampling_logp_difference/mean": 0.024347130209207535, "step": 122, "step_time": 20.791431099176407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0446428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 302.64288330078125, "completions/mean_terminated_length": 221.0841064453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 1.5615538358688354, "epoch": 0.01565084616363405, "frac_reward_zero_std": 0.5, "grad_norm": 0.6482431292533875, "kl": 0.3310888931155205, "learning_rate": 9.844763964881029e-06, "loss": 0.0256, "num_tokens": 21352026.0, "reward": 0.3943452537059784, "reward_std": 0.333329975605011, "rewards/unified_reward/mean": 0.3943452537059784, "rewards/unified_reward/std": 0.333329975605011, "sampling/importance_sampling_ratio/max": 2.8080646991729736, "sampling/importance_sampling_ratio/mean": 0.8698590993881226, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.878891944885254, "sampling/sampling_logp_difference/mean": 0.023222636431455612, "step": 123, "step_time": 21.277153628878295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 412.52679443359375, "completions/mean_terminated_length": 234.4059295654297, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.674216389656067, "epoch": 0.015778088815370912, "frac_reward_zero_std": 0.5, "grad_norm": 0.5026310086250305, "kl": 0.2188900038599968, "learning_rate": 9.84349153836366e-06, "loss": 0.0214, "num_tokens": 21413677.0, "reward": 0.4724702835083008, "reward_std": 0.41136085987091064, "rewards/unified_reward/mean": 0.4724702537059784, "rewards/unified_reward/std": 0.41136085987091064, "sampling/importance_sampling_ratio/max": 2.8763320446014404, "sampling/importance_sampling_ratio/mean": 0.8154711127281189, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49861812591552734, "sampling/sampling_logp_difference/mean": 0.021994438022375107, "step": 124, "step_time": 22.47631218493916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1762.0, "completions/mean_length": 347.33038330078125, "completions/mean_terminated_length": 198.72816467285156, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 1.7800407111644745, "epoch": 0.015905331467107774, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.7425587773323059, "kl": 0.2361934669315815, "learning_rate": 9.842219111846292e-06, "loss": 0.0194, "num_tokens": 21490506.0, "reward": 0.4401785731315613, "reward_std": 0.32478955388069153, "rewards/unified_reward/mean": 0.4401785731315613, "rewards/unified_reward/std": 0.32478955388069153, "sampling/importance_sampling_ratio/max": 2.844170331954956, "sampling/importance_sampling_ratio/mean": 0.842910885810852, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5155305862426758, "sampling/sampling_logp_difference/mean": 0.022411607205867767, "step": 125, "step_time": 25.54264137125574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 619.8125, "completions/mean_terminated_length": 270.70001220703125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7250426411628723, "epoch": 0.016032574118844636, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.36944684386253357, "kl": 0.3126302435994148, "learning_rate": 9.840946685328923e-06, "loss": 0.0458, "num_tokens": 21608837.0, "reward": 0.5498512387275696, "reward_std": 0.4332379996776581, "rewards/unified_reward/mean": 0.5498512387275696, "rewards/unified_reward/std": 0.4332379996776581, "sampling/importance_sampling_ratio/max": 2.3838436603546143, "sampling/importance_sampling_ratio/mean": 0.7618723511695862, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5893445014953613, "sampling/sampling_logp_difference/mean": 0.02211265079677105, "step": 126, "step_time": 36.26564237428829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 620.5178833007812, "completions/mean_terminated_length": 271.5777893066406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9307145476341248, "epoch": 0.0161598167705815, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.5134349465370178, "kl": 0.4075772762298584, "learning_rate": 9.839674258811555e-06, "loss": 0.0587, "num_tokens": 21699239.0, "reward": 0.3717261850833893, "reward_std": 0.37344831228256226, "rewards/unified_reward/mean": 0.3717261850833893, "rewards/unified_reward/std": 0.37344834208488464, "sampling/importance_sampling_ratio/max": 2.987459421157837, "sampling/importance_sampling_ratio/mean": 0.7811146378517151, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.40700697898864746, "sampling/sampling_logp_difference/mean": 0.025565769523382187, "step": 127, "step_time": 26.17187184514478 }, { "clip_ratio/high_max": 0.00015945217455737293, "clip_ratio/high_mean": 2.277888142998563e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.277888142998563e-05, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 462.6875305175781, "completions/mean_terminated_length": 159.1170196533203, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.4884356260299683, "epoch": 0.01628705942231836, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.6831024885177612, "kl": 0.20983342081308365, "learning_rate": 9.838401832294186e-06, "loss": 0.2011, "num_tokens": 21777668.0, "reward": 0.5543155074119568, "reward_std": 0.40140360593795776, "rewards/unified_reward/mean": 0.5543155074119568, "rewards/unified_reward/std": 0.4014035761356354, "sampling/importance_sampling_ratio/max": 2.550935983657837, "sampling/importance_sampling_ratio/mean": 0.8673187494277954, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4685349464416504, "sampling/sampling_logp_difference/mean": 0.018184835091233253, "step": 128, "step_time": 24.592129057040438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1517857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 496.6607360839844, "completions/mean_terminated_length": 219.05264282226562, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4351752698421478, "epoch": 0.016414302074055222, "frac_reward_zero_std": 0.5, "grad_norm": 0.3284876346588135, "kl": 0.25948135554790497, "learning_rate": 9.837129405776817e-06, "loss": 0.1936, "num_tokens": 21865446.0, "reward": 0.49196428060531616, "reward_std": 0.4228873550891876, "rewards/unified_reward/mean": 0.49196428060531616, "rewards/unified_reward/std": 0.4228874146938324, "sampling/importance_sampling_ratio/max": 2.891782283782959, "sampling/importance_sampling_ratio/mean": 0.914946436882019, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4542722702026367, "sampling/sampling_logp_difference/mean": 0.016499821096658707, "step": 129, "step_time": 27.160542531171814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 351.2946472167969, "completions/mean_terminated_length": 128.4949493408203, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.1673942506313324, "epoch": 0.016541544725792084, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.8883553743362427, "kl": 0.40311310440301895, "learning_rate": 9.835856979259449e-06, "loss": 0.1927, "num_tokens": 21928279.0, "reward": 0.5418155193328857, "reward_std": 0.32768484950065613, "rewards/unified_reward/mean": 0.541815459728241, "rewards/unified_reward/std": 0.32768481969833374, "sampling/importance_sampling_ratio/max": 2.407761812210083, "sampling/importance_sampling_ratio/mean": 0.8869686722755432, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5971469879150391, "sampling/sampling_logp_difference/mean": 0.02648005075752735, "step": 130, "step_time": 23.05364525085315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 542.5178833007812, "completions/mean_terminated_length": 131.93182373046875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.806090772151947, "epoch": 0.016668787377528946, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 1.0705453157424927, "kl": 0.31251227110624313, "learning_rate": 9.83458455274208e-06, "loss": 0.0205, "num_tokens": 22013065.0, "reward": 0.4739583730697632, "reward_std": 0.38975095748901367, "rewards/unified_reward/mean": 0.4739583432674408, "rewards/unified_reward/std": 0.38975095748901367, "sampling/importance_sampling_ratio/max": 2.3311870098114014, "sampling/importance_sampling_ratio/mean": 0.8076363801956177, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5657615661621094, "sampling/sampling_logp_difference/mean": 0.023777123540639877, "step": 131, "step_time": 27.131015609251335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.133928582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 427.2232360839844, "completions/mean_terminated_length": 176.58761596679688, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.5020290315151215, "epoch": 0.01679603002926581, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 1.124707579612732, "kl": 0.27577390894293785, "learning_rate": 9.833312126224712e-06, "loss": 0.1745, "num_tokens": 22077698.0, "reward": 0.5383929014205933, "reward_std": 0.4362046420574188, "rewards/unified_reward/mean": 0.5383928418159485, "rewards/unified_reward/std": 0.4362046420574188, "sampling/importance_sampling_ratio/max": 2.5203330516815186, "sampling/importance_sampling_ratio/mean": 0.8785881400108337, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.41141605377197266, "sampling/sampling_logp_difference/mean": 0.020536988973617554, "step": 132, "step_time": 22.648722298210487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 489.08038330078125, "completions/mean_terminated_length": 170.59140014648438, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.9065138697624207, "epoch": 0.01692327268100267, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.45983365178108215, "kl": 0.31683873012661934, "learning_rate": 9.832039699707343e-06, "loss": 0.0501, "num_tokens": 22159099.0, "reward": 0.5580357313156128, "reward_std": 0.35805463790893555, "rewards/unified_reward/mean": 0.5580357313156128, "rewards/unified_reward/std": 0.35805460810661316, "sampling/importance_sampling_ratio/max": 2.625025749206543, "sampling/importance_sampling_ratio/mean": 0.8191603422164917, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48530471324920654, "sampling/sampling_logp_difference/mean": 0.02481505647301674, "step": 133, "step_time": 26.08787261811085 }, { "clip_ratio/high_max": 0.0005726065137423575, "clip_ratio/high_mean": 8.180092845577747e-05, "clip_ratio/low_mean": 4.311148722990765e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00012491241568568512, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 449.4285888671875, "completions/mean_terminated_length": 143.3191375732422, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.0546876341104507, "epoch": 0.017050515332739533, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.587821900844574, "kl": 0.3526753857731819, "learning_rate": 9.830767273189974e-06, "loss": 0.1626, "num_tokens": 22236419.0, "reward": 0.4675595760345459, "reward_std": 0.34991660714149475, "rewards/unified_reward/mean": 0.4675595462322235, "rewards/unified_reward/std": 0.34991663694381714, "sampling/importance_sampling_ratio/max": 2.171826124191284, "sampling/importance_sampling_ratio/mean": 0.9012467265129089, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.45539188385009766, "sampling/sampling_logp_difference/mean": 0.015292014926671982, "step": 134, "step_time": 25.04190014791675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2946428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1702.0, "completions/mean_length": 692.0535888671875, "completions/mean_terminated_length": 125.64557647705078, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 1.6232003271579742, "epoch": 0.017177757984476398, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.5302683115005493, "kl": 0.28142815083265305, "learning_rate": 9.829494846672606e-06, "loss": -0.0973, "num_tokens": 22341441.0, "reward": 0.44434529542922974, "reward_std": 0.3428780138492584, "rewards/unified_reward/mean": 0.44434526562690735, "rewards/unified_reward/std": 0.3428780138492584, "sampling/importance_sampling_ratio/max": 2.8722734451293945, "sampling/importance_sampling_ratio/mean": 0.7667055130004883, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6197196245193481, "sampling/sampling_logp_difference/mean": 0.020157670602202415, "step": 135, "step_time": 29.89285085373558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.208232778386446e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.208232778386446e-05, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 713.0803833007812, "completions/mean_terminated_length": 106.29869842529297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7569309771060944, "epoch": 0.01730500063621326, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.30604225397109985, "kl": 0.2085118070244789, "learning_rate": 9.828222420155237e-06, "loss": 0.0156, "num_tokens": 22463050.0, "reward": 0.39717262983322144, "reward_std": 0.39437922835350037, "rewards/unified_reward/mean": 0.39717262983322144, "rewards/unified_reward/std": 0.39437922835350037, "sampling/importance_sampling_ratio/max": 2.4498450756073, "sampling/importance_sampling_ratio/mean": 0.799103319644928, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4806525707244873, "sampling/sampling_logp_difference/mean": 0.01833593100309372, "step": 136, "step_time": 37.578473003348336 }, { "clip_ratio/high_max": 0.0010536165937082842, "clip_ratio/high_mean": 0.00015051665650389623, "clip_ratio/low_mean": 5.8797824749490246e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020931447579641826, "completions/clipped_ratio": 0.392857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 901.0982666015625, "completions/mean_terminated_length": 158.98529052734375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3260311484336853, "epoch": 0.017432243287950122, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3309539258480072, "kl": 0.18161651492118835, "learning_rate": 9.826949993637869e-06, "loss": 0.0933, "num_tokens": 22585573.0, "reward": 0.4537202715873718, "reward_std": 0.3940225839614868, "rewards/unified_reward/mean": 0.4537202715873718, "rewards/unified_reward/std": 0.3940225839614868, "sampling/importance_sampling_ratio/max": 2.799482822418213, "sampling/importance_sampling_ratio/mean": 0.8029465079307556, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4967308044433594, "sampling/sampling_logp_difference/mean": 0.015477697364985943, "step": 137, "step_time": 32.491692658746615 }, { "clip_ratio/high_max": 3.678634311654605e-05, "clip_ratio/high_mean": 5.2551918088283855e-06, "clip_ratio/low_mean": 5.756654445576714e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.10118462544051e-05, "completions/clipped_ratio": 0.3660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 946.3125610351562, "completions/mean_terminated_length": 310.12677001953125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3936481773853302, "epoch": 0.017559485939686984, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.2143641710281372, "kl": 0.17791936546564102, "learning_rate": 9.8256775671205e-06, "loss": -0.0776, "num_tokens": 22710584.0, "reward": 0.3991071581840515, "reward_std": 0.37842753529548645, "rewards/unified_reward/mean": 0.3991071581840515, "rewards/unified_reward/std": 0.37842753529548645, "sampling/importance_sampling_ratio/max": 2.295398473739624, "sampling/importance_sampling_ratio/mean": 0.6590508222579956, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.746468424797058, "sampling/sampling_logp_difference/mean": 0.017982399091124535, "step": 138, "step_time": 31.150952756172046 }, { "clip_ratio/high_max": 0.0009608152322471142, "clip_ratio/high_mean": 0.00015448763224412687, "clip_ratio/low_mean": 4.117562093597371e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019566325318010058, "completions/clipped_ratio": 0.598214328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 1280.7679443359375, "completions/mean_terminated_length": 138.44444274902344, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.6062714755535126, "epoch": 0.017686728591423846, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.1902925670146942, "kl": 0.12315504811704159, "learning_rate": 9.82440514060313e-06, "loss": 0.0505, "num_tokens": 22877902.0, "reward": 0.4806548058986664, "reward_std": 0.35742202401161194, "rewards/unified_reward/mean": 0.480654776096344, "rewards/unified_reward/std": 0.3574220538139343, "sampling/importance_sampling_ratio/max": 2.3487541675567627, "sampling/importance_sampling_ratio/mean": 0.7474222779273987, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49431705474853516, "sampling/sampling_logp_difference/mean": 0.014287960715591908, "step": 139, "step_time": 38.06134281703271 }, { "clip_ratio/high_max": 0.0004857924359384924, "clip_ratio/high_mean": 6.939892136870185e-05, "clip_ratio/low_mean": 6.488239614554914e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013428131978798774, "completions/clipped_ratio": 0.6428571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 1394.8126220703125, "completions/mean_terminated_length": 219.0749969482422, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 1.506188690662384, "epoch": 0.017813971243160708, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.22131997346878052, "kl": 0.141466049477458, "learning_rate": 9.823132714085763e-06, "loss": -0.1006, "num_tokens": 23062833.0, "reward": 0.2723214328289032, "reward_std": 0.3308810293674469, "rewards/unified_reward/mean": 0.2723214328289032, "rewards/unified_reward/std": 0.3308809995651245, "sampling/importance_sampling_ratio/max": 2.9829394817352295, "sampling/importance_sampling_ratio/mean": 0.7297333478927612, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.640408992767334, "sampling/sampling_logp_difference/mean": 0.01734822429716587, "step": 140, "step_time": 39.25003536604345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 3.3841715776361525e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.3841715776361525e-05, "completions/clipped_ratio": 0.660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 1426.857177734375, "completions/mean_terminated_length": 217.26315307617188, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.6695854365825653, "epoch": 0.01794121389489757, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.2206861674785614, "kl": 0.14488019980490208, "learning_rate": 9.821860287568393e-06, "loss": -0.0977, "num_tokens": 23253577.0, "reward": 0.3645833730697632, "reward_std": 0.32383716106414795, "rewards/unified_reward/mean": 0.3645833432674408, "rewards/unified_reward/std": 0.32383716106414795, "sampling/importance_sampling_ratio/max": 2.331397771835327, "sampling/importance_sampling_ratio/mean": 0.6242743730545044, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5567405223846436, "sampling/sampling_logp_difference/mean": 0.01792856678366661, "step": 141, "step_time": 44.55273940344341 }, { "clip_ratio/high_max": 0.0008441171812592074, "clip_ratio/high_mean": 0.00012058817083016038, "clip_ratio/low_mean": 3.6019630442751804e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00015660779308745987, "completions/clipped_ratio": 0.5446428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1198.15185546875, "completions/mean_terminated_length": 181.6666717529297, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 1.9082340598106384, "epoch": 0.018068456546634432, "frac_reward_zero_std": 0.5, "grad_norm": 0.36721834540367126, "kl": 0.16681642457842827, "learning_rate": 9.820587861051026e-06, "loss": -0.0915, "num_tokens": 23419066.0, "reward": 0.4074404835700989, "reward_std": 0.4176105558872223, "rewards/unified_reward/mean": 0.4074404537677765, "rewards/unified_reward/std": 0.4176105558872223, "sampling/importance_sampling_ratio/max": 2.5801684856414795, "sampling/importance_sampling_ratio/mean": 0.5855328440666199, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1517958641052246, "sampling/sampling_logp_difference/mean": 0.02088075503706932, "step": 142, "step_time": 39.17192927678116 }, { "clip_ratio/high_max": 3.0517578125e-05, "clip_ratio/high_mean": 4.359654212748865e-06, "clip_ratio/low_mean": 5.667550294674584e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.103515488575795e-05, "completions/clipped_ratio": 0.7500000596046448, "completions/max_length": 2048.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 1576.884033203125, "completions/mean_terminated_length": 163.5357208251953, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.9385441541671753, "epoch": 0.018195699198371294, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.17817124724388123, "kl": 0.15169186145067215, "learning_rate": 9.819315434533655e-06, "loss": 0.0564, "num_tokens": 23617965.0, "reward": 0.35610121488571167, "reward_std": 0.34283745288848877, "rewards/unified_reward/mean": 0.35610121488571167, "rewards/unified_reward/std": 0.34283745288848877, "sampling/importance_sampling_ratio/max": 2.7198145389556885, "sampling/importance_sampling_ratio/mean": 0.6018750071525574, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5374102592468262, "sampling/sampling_logp_difference/mean": 0.019750256091356277, "step": 143, "step_time": 42.8266614198219 }, { "clip_ratio/high_max": 0.0008329392476298381, "clip_ratio/high_mean": 0.00012462360336940037, "clip_ratio/low_mean": 0.00012329854143899865, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024792215208435664, "completions/clipped_ratio": 0.6160714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 1355.02685546875, "completions/mean_terminated_length": 243.0465087890625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4292809069156647, "epoch": 0.018322941850108156, "frac_reward_zero_std": 0.5, "grad_norm": 0.28087183833122253, "kl": 0.11732654459774494, "learning_rate": 9.818043008016288e-06, "loss": -0.0321, "num_tokens": 23794544.0, "reward": 0.4053571820259094, "reward_std": 0.41706714034080505, "rewards/unified_reward/mean": 0.40535715222358704, "rewards/unified_reward/std": 0.41706714034080505, "sampling/importance_sampling_ratio/max": 2.920947313308716, "sampling/importance_sampling_ratio/mean": 0.6723548173904419, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6103838682174683, "sampling/sampling_logp_difference/mean": 0.015769964084029198, "step": 144, "step_time": 39.15078723686747 }, { "clip_ratio/high_max": 0.001442123466404155, "clip_ratio/high_mean": 0.00024240829225163907, "clip_ratio/low_mean": 0.00012111257365177153, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003635208704508841, "completions/clipped_ratio": 0.660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 1413.0804443359375, "completions/mean_terminated_length": 176.65789794921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3733888566493988, "epoch": 0.01845018450184502, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.20074079930782318, "kl": 0.159144951030612, "learning_rate": 9.816770581498918e-06, "loss": -0.0456, "num_tokens": 23978929.0, "reward": 0.35089290142059326, "reward_std": 0.37938520312309265, "rewards/unified_reward/mean": 0.3508928716182709, "rewards/unified_reward/std": 0.37938520312309265, "sampling/importance_sampling_ratio/max": 2.0524508953094482, "sampling/importance_sampling_ratio/mean": 0.6938287019729614, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48116135597229004, "sampling/sampling_logp_difference/mean": 0.016958992928266525, "step": 145, "step_time": 39.59496482368559 }, { "clip_ratio/high_max": 0.002459934097714722, "clip_ratio/high_mean": 0.00040373500451096334, "clip_ratio/low_mean": 7.847377401049016e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004822087721549906, "completions/clipped_ratio": 0.7321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 1530.7410888671875, "completions/mean_terminated_length": 116.90000915527344, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.5065445005893707, "epoch": 0.01857742715358188, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.40229007601737976, "kl": 0.16216934844851494, "learning_rate": 9.815498154981551e-06, "loss": 0.0444, "num_tokens": 24170644.0, "reward": 0.5139881372451782, "reward_std": 0.36017704010009766, "rewards/unified_reward/mean": 0.5139880776405334, "rewards/unified_reward/std": 0.36017704010009766, "sampling/importance_sampling_ratio/max": 2.997115135192871, "sampling/importance_sampling_ratio/mean": 0.657873809337616, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.77317214012146, "sampling/sampling_logp_difference/mean": 0.0191934946924448, "step": 146, "step_time": 40.5605643640738 }, { "clip_ratio/high_max": 0.0006288114527706057, "clip_ratio/high_mean": 0.00010139758614968741, "clip_ratio/low_mean": 0.0004378017329145223, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005391993254306726, "completions/clipped_ratio": 0.598214328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 1295.125, "completions/mean_terminated_length": 174.1777801513672, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.7615625858306885, "epoch": 0.018704669805318742, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.18114891648292542, "kl": 0.15929753333330154, "learning_rate": 9.814225728464181e-06, "loss": -0.0089, "num_tokens": 24392002.0, "reward": 0.5125000476837158, "reward_std": 0.4200463593006134, "rewards/unified_reward/mean": 0.512499988079071, "rewards/unified_reward/std": 0.4200463593006134, "sampling/importance_sampling_ratio/max": 2.19480299949646, "sampling/importance_sampling_ratio/mean": 0.595892608165741, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6126213073730469, "sampling/sampling_logp_difference/mean": 0.020235426723957062, "step": 147, "step_time": 60.331882876344025 }, { "clip_ratio/high_max": 0.0015902843442745507, "clip_ratio/high_mean": 0.00027355062411515974, "clip_ratio/low_mean": 0.0002102411599480547, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0004837917585973628, "completions/clipped_ratio": 0.7410714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 1568.33935546875, "completions/mean_terminated_length": 195.51724243164062, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6156274676322937, "epoch": 0.018831912457055604, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.16800449788570404, "kl": 0.14917899295687675, "learning_rate": 9.812953301946814e-06, "loss": 0.003, "num_tokens": 24589752.0, "reward": 0.3705357313156128, "reward_std": 0.3626105487346649, "rewards/unified_reward/mean": 0.3705357015132904, "rewards/unified_reward/std": 0.3626105487346649, "sampling/importance_sampling_ratio/max": 2.1905622482299805, "sampling/importance_sampling_ratio/mean": 0.4771307110786438, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4706757068634033, "sampling/sampling_logp_difference/mean": 0.020194295793771744, "step": 148, "step_time": 41.62218208378181 }, { "clip_ratio/high_max": 0.0012128857924835756, "clip_ratio/high_mean": 0.0001732693945086794, "clip_ratio/low_mean": 7.38716034902609e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002471409970894456, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 1328.259033203125, "completions/mean_terminated_length": 128.6904754638672, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 1.7037026584148407, "epoch": 0.018959155108792466, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.19778665900230408, "kl": 0.13414669036865234, "learning_rate": 9.811680875429444e-06, "loss": -0.0473, "num_tokens": 24756509.0, "reward": 0.42604169249534607, "reward_std": 0.40542975068092346, "rewards/unified_reward/mean": 0.4260416626930237, "rewards/unified_reward/std": 0.40542975068092346, "sampling/importance_sampling_ratio/max": 1.9641642570495605, "sampling/importance_sampling_ratio/mean": 0.6134362816810608, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6846427917480469, "sampling/sampling_logp_difference/mean": 0.01975112408399582, "step": 149, "step_time": 37.67135197529569 }, { "clip_ratio/high_max": 0.002774098189547658, "clip_ratio/high_mean": 0.000435536632721778, "clip_ratio/low_mean": 6.709770332236076e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005026343424106017, "completions/clipped_ratio": 0.6696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 1424.696533203125, "completions/mean_terminated_length": 161.2432403564453, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.916651725769043, "epoch": 0.01908639776052933, "frac_reward_zero_std": 0.5, "grad_norm": 0.1612670123577118, "kl": 0.1906207948923111, "learning_rate": 9.810408448912077e-06, "loss": 0.0017, "num_tokens": 24937035.0, "reward": 0.41562503576278687, "reward_std": 0.2937987744808197, "rewards/unified_reward/mean": 0.41562503576278687, "rewards/unified_reward/std": 0.2937988042831421, "sampling/importance_sampling_ratio/max": 2.480292320251465, "sampling/importance_sampling_ratio/mean": 0.4806610643863678, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7212672233581543, "sampling/sampling_logp_difference/mean": 0.02268386073410511, "step": 150, "step_time": 39.425229515647516 }, { "clip_ratio/high_max": 0.0013427734375, "clip_ratio/high_mean": 0.00023959243117133155, "clip_ratio/low_mean": 0.00013531195145333186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003749043826246634, "completions/clipped_ratio": 0.598214328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1336.5179443359375, "completions/mean_terminated_length": 277.20001220703125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 1.5806286931037903, "epoch": 0.01921364041226619, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.14131765067577362, "kl": 0.13581206649541855, "learning_rate": 9.809136022394707e-06, "loss": -0.0423, "num_tokens": 25126653.0, "reward": 0.4590774178504944, "reward_std": 0.42843666672706604, "rewards/unified_reward/mean": 0.459077388048172, "rewards/unified_reward/std": 0.4284366965293884, "sampling/importance_sampling_ratio/max": 2.9506473541259766, "sampling/importance_sampling_ratio/mean": 0.529996395111084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48863887786865234, "sampling/sampling_logp_difference/mean": 0.020514795556664467, "step": 151, "step_time": 44.62971600797027 }, { "clip_ratio/high_max": 0.0020812048169318587, "clip_ratio/high_mean": 0.00041167409654008225, "clip_ratio/low_mean": 0.00011282565901638009, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005244997446425259, "completions/clipped_ratio": 0.6785714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 1455.169677734375, "completions/mean_terminated_length": 203.63888549804688, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 1.815227210521698, "epoch": 0.019340883064003053, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.48761415481567383, "kl": 0.14513910934329033, "learning_rate": 9.80786359587734e-06, "loss": -0.0541, "num_tokens": 25307456.0, "reward": 0.39404764771461487, "reward_std": 0.37747421860694885, "rewards/unified_reward/mean": 0.39404764771461487, "rewards/unified_reward/std": 0.37747421860694885, "sampling/importance_sampling_ratio/max": 2.8877816200256348, "sampling/importance_sampling_ratio/mean": 0.633915901184082, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6595743894577026, "sampling/sampling_logp_difference/mean": 0.019987106323242188, "step": 152, "step_time": 38.639632082078606 }, { "clip_ratio/high_max": 0.001983642578125, "clip_ratio/high_mean": 0.00038364954889402725, "clip_ratio/low_mean": 0.00013019024390814593, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005138397900736891, "completions/clipped_ratio": 0.6785714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 1451.08935546875, "completions/mean_terminated_length": 190.94444274902344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "entropy": 1.7780422270298004, "epoch": 0.019468125715739915, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.28930073976516724, "kl": 0.19500989094376564, "learning_rate": 9.80659116935997e-06, "loss": -0.0023, "num_tokens": 25487586.0, "reward": 0.47529762983322144, "reward_std": 0.3225817382335663, "rewards/unified_reward/mean": 0.47529762983322144, "rewards/unified_reward/std": 0.3225817084312439, "sampling/importance_sampling_ratio/max": 1.8382900953292847, "sampling/importance_sampling_ratio/mean": 0.5741323232650757, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4750089645385742, "sampling/sampling_logp_difference/mean": 0.02152342163026333, "step": 153, "step_time": 38.94907058309764 }, { "clip_ratio/high_max": 0.005348612874513492, "clip_ratio/high_mean": 0.0007808837171978666, "clip_ratio/low_mean": 0.00014937870582798496, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009302624457632191, "completions/clipped_ratio": 0.660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 1384.40185546875, "completions/mean_terminated_length": 92.13157653808594, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.9211896657943726, "epoch": 0.019595368367476777, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.1514924019575119, "kl": 0.1376328356564045, "learning_rate": 9.8053187428426e-06, "loss": -0.0494, "num_tokens": 25672767.0, "reward": 0.35133928060531616, "reward_std": 0.31869620084762573, "rewards/unified_reward/mean": 0.35133928060531616, "rewards/unified_reward/std": 0.31869617104530334, "sampling/importance_sampling_ratio/max": 2.64491605758667, "sampling/importance_sampling_ratio/mean": 0.5224066376686096, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5192563533782959, "sampling/sampling_logp_difference/mean": 0.021568099036812782, "step": 154, "step_time": 43.67903907294385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 4.605124695444829e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 4.605124695444829e-05, "completions/clipped_ratio": 0.6339285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 1359.044677734375, "completions/mean_terminated_length": 165.97560119628906, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 1.8292428255081177, "epoch": 0.019722611019213642, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.1925336718559265, "kl": 0.15117113664746284, "learning_rate": 9.804046316325234e-06, "loss": 0.0201, "num_tokens": 25850036.0, "reward": 0.31636905670166016, "reward_std": 0.3578798770904541, "rewards/unified_reward/mean": 0.31636905670166016, "rewards/unified_reward/std": 0.3578798770904541, "sampling/importance_sampling_ratio/max": 2.6887965202331543, "sampling/importance_sampling_ratio/mean": 0.6484972238540649, "sampling/importance_sampling_ratio/min": 0.000626571592874825, "sampling/sampling_logp_difference/max": 0.8809545040130615, "sampling/sampling_logp_difference/mean": 0.020700804889202118, "step": 155, "step_time": 39.22703180415556 }, { "clip_ratio/high_max": 0.0011564529049792327, "clip_ratio/high_mean": 0.00016956721174210543, "clip_ratio/low_mean": 0.00017874581135401968, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000348313020367641, "completions/clipped_ratio": 0.7500000596046448, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1614.90185546875, "completions/mean_terminated_length": 315.6071472167969, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 2.0992585718631744, "epoch": 0.019849853670950504, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.20692768692970276, "kl": 0.16840537264943123, "learning_rate": 9.802773889807864e-06, "loss": 0.0381, "num_tokens": 26050217.0, "reward": 0.3739583194255829, "reward_std": 0.37051475048065186, "rewards/unified_reward/mean": 0.3739583194255829, "rewards/unified_reward/std": 0.37051475048065186, "sampling/importance_sampling_ratio/max": 2.9018197059631348, "sampling/importance_sampling_ratio/mean": 0.43995222449302673, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49593496322631836, "sampling/sampling_logp_difference/mean": 0.02294042520225048, "step": 156, "step_time": 42.8979241787456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6696428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 1423.696533203125, "completions/mean_terminated_length": 158.21621704101562, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6723560690879822, "epoch": 0.019977096322687366, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.0811178907752037, "kl": 0.16335345804691315, "learning_rate": 9.801501463290497e-06, "loss": 0.0043, "num_tokens": 26258847.0, "reward": 0.30892857909202576, "reward_std": 0.3030288517475128, "rewards/unified_reward/mean": 0.30892854928970337, "rewards/unified_reward/std": 0.30302882194519043, "sampling/importance_sampling_ratio/max": 2.467679023742676, "sampling/importance_sampling_ratio/mean": 0.616638720035553, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7192873954772949, "sampling/sampling_logp_difference/mean": 0.02090393379330635, "step": 157, "step_time": 58.97587332385592 }, { "clip_ratio/high_max": 0.0008544353186152875, "clip_ratio/high_mean": 0.00012206218525534496, "clip_ratio/low_mean": 9.821168623602716e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002202738678533933, "completions/clipped_ratio": 0.6517857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 1476.65185546875, "completions/mean_terminated_length": 407.20513916015625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 1.8443779051303864, "epoch": 0.020104338974424228, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.29931676387786865, "kl": 0.17307845130562782, "learning_rate": 9.800229036773126e-06, "loss": 0.0299, "num_tokens": 26449160.0, "reward": 0.4174107313156128, "reward_std": 0.3478044271469116, "rewards/unified_reward/mean": 0.4174107015132904, "rewards/unified_reward/std": 0.34780439734458923, "sampling/importance_sampling_ratio/max": 2.742306709289551, "sampling/importance_sampling_ratio/mean": 0.546108603477478, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7455110549926758, "sampling/sampling_logp_difference/mean": 0.022077403962612152, "step": 158, "step_time": 40.84042826690711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6160714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1369.8482666015625, "completions/mean_terminated_length": 281.6511535644531, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9369060695171356, "epoch": 0.02023158162616109, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.34930822253227234, "kl": 0.2693096660077572, "learning_rate": 9.79895661025576e-06, "loss": 0.0219, "num_tokens": 26628479.0, "reward": 0.6342262029647827, "reward_std": 0.41711291670799255, "rewards/unified_reward/mean": 0.6342262029647827, "rewards/unified_reward/std": 0.41711288690567017, "sampling/importance_sampling_ratio/max": 2.5140936374664307, "sampling/importance_sampling_ratio/mean": 0.6189820170402527, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5656318664550781, "sampling/sampling_logp_difference/mean": 0.023249592632055283, "step": 159, "step_time": 39.02323696715757 }, { "clip_ratio/high_max": 0.0019579052241169848, "clip_ratio/high_mean": 0.000279700740975386, "clip_ratio/low_mean": 1.379813147650566e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029349886790441815, "completions/clipped_ratio": 0.660714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 1450.33935546875, "completions/mean_terminated_length": 286.47369384765625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.8126018047332764, "epoch": 0.020358824277897952, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.11005160212516785, "kl": 0.17803282476961613, "learning_rate": 9.79768418373839e-06, "loss": -0.0411, "num_tokens": 26820629.0, "reward": 0.4471726715564728, "reward_std": 0.3726772367954254, "rewards/unified_reward/mean": 0.4471726417541504, "rewards/unified_reward/std": 0.3726772367954254, "sampling/importance_sampling_ratio/max": 1.9581712484359741, "sampling/importance_sampling_ratio/mean": 0.5467262864112854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0603442192077637, "sampling/sampling_logp_difference/mean": 0.020377496257424355, "step": 160, "step_time": 41.7360419628676 }, { "clip_ratio/high_max": 0.0008967548128566705, "clip_ratio/high_mean": 0.00012810783027816797, "clip_ratio/low_mean": 1.3078962183499243e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001411867933711619, "completions/clipped_ratio": 0.6964285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 1502.33935546875, "completions/mean_terminated_length": 250.5294189453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "entropy": 2.0826189815998077, "epoch": 0.020486066929634814, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.2263454794883728, "kl": 0.16797343641519547, "learning_rate": 9.796411757221022e-06, "loss": 0.0416, "num_tokens": 27031979.0, "reward": 0.2633928954601288, "reward_std": 0.2656402885913849, "rewards/unified_reward/mean": 0.2633928656578064, "rewards/unified_reward/std": 0.2656402885913849, "sampling/importance_sampling_ratio/max": 2.9804069995880127, "sampling/importance_sampling_ratio/mean": 0.60013347864151, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0487778186798096, "sampling/sampling_logp_difference/mean": 0.022163545712828636, "step": 161, "step_time": 49.620246432721615 }, { "clip_ratio/high_max": 0.0032153388019651175, "clip_ratio/high_mean": 0.0006485697813332081, "clip_ratio/low_mean": 2.0307336399127962e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006688771391054615, "completions/clipped_ratio": 0.5803571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 1268.77685546875, "completions/mean_terminated_length": 191.12765502929688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9063501358032227, "epoch": 0.020613309581371676, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.382066011428833, "kl": 0.24353952333331108, "learning_rate": 9.795139330703652e-06, "loss": 0.0891, "num_tokens": 27217090.0, "reward": 0.4407738447189331, "reward_std": 0.38298940658569336, "rewards/unified_reward/mean": 0.4407738149166107, "rewards/unified_reward/std": 0.38298937678337097, "sampling/importance_sampling_ratio/max": 2.410264015197754, "sampling/importance_sampling_ratio/mean": 0.620401918888092, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6710200309753418, "sampling/sampling_logp_difference/mean": 0.021872185170650482, "step": 162, "step_time": 45.62601582100615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00010536375702940859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00010536375702940859, "completions/clipped_ratio": 0.5446428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 1175.9285888671875, "completions/mean_terminated_length": 132.8627471923828, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.9465121924877167, "epoch": 0.02074055223310854, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.30484524369239807, "kl": 0.20778966322541237, "learning_rate": 9.793866904186285e-06, "loss": 0.0066, "num_tokens": 27383386.0, "reward": 0.4315476715564728, "reward_std": 0.3826528489589691, "rewards/unified_reward/mean": 0.4315476417541504, "rewards/unified_reward/std": 0.3826528489589691, "sampling/importance_sampling_ratio/max": 2.5108087062835693, "sampling/importance_sampling_ratio/mean": 0.6679743528366089, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4208066463470459, "sampling/sampling_logp_difference/mean": 0.02323893830180168, "step": 163, "step_time": 40.06351502821781 }, { "clip_ratio/high_max": 0.0004452655848581344, "clip_ratio/high_mean": 6.360936822602525e-05, "clip_ratio/low_mean": 8.686997171025723e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001504793472122401, "completions/clipped_ratio": 0.7053571939468384, "completions/max_length": 2048.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 1508.4285888671875, "completions/mean_terminated_length": 216.72727966308594, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0856508910655975, "epoch": 0.0208677948848454, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.16178986430168152, "kl": 0.15191974490880966, "learning_rate": 9.792594477668915e-06, "loss": -0.0227, "num_tokens": 27580018.0, "reward": 0.3202381134033203, "reward_std": 0.3285575807094574, "rewards/unified_reward/mean": 0.3202381134033203, "rewards/unified_reward/std": 0.3285575807094574, "sampling/importance_sampling_ratio/max": 2.7220206260681152, "sampling/importance_sampling_ratio/mean": 0.5897669196128845, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48006463050842285, "sampling/sampling_logp_difference/mean": 0.022554125636816025, "step": 164, "step_time": 41.939740031724796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 9.814715667744167e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 9.814715667744167e-06, "completions/clipped_ratio": 0.5892857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 1292.4107666015625, "completions/mean_terminated_length": 208.30435180664062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.7781542837619781, "epoch": 0.020995037536582262, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.2422775775194168, "kl": 0.1320081613957882, "learning_rate": 9.791322051151548e-06, "loss": 0.0076, "num_tokens": 27743584.0, "reward": 0.5778273940086365, "reward_std": 0.409993976354599, "rewards/unified_reward/mean": 0.5778273940086365, "rewards/unified_reward/std": 0.4099939465522766, "sampling/importance_sampling_ratio/max": 2.1662240028381348, "sampling/importance_sampling_ratio/mean": 0.6849789619445801, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7888422012329102, "sampling/sampling_logp_difference/mean": 0.02012511156499386, "step": 165, "step_time": 37.72093509952538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.679753030563006e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.679753030563006e-06, "completions/clipped_ratio": 0.723214328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 1565.8482666015625, "completions/mean_terminated_length": 306.0322570800781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9334682524204254, "epoch": 0.021122280188319124, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.1970282644033432, "kl": 0.13537670113146305, "learning_rate": 9.790049624634178e-06, "loss": 0.0061, "num_tokens": 27945247.0, "reward": 0.3742559850215912, "reward_std": 0.37711626291275024, "rewards/unified_reward/mean": 0.3742559552192688, "rewards/unified_reward/std": 0.37711623311042786, "sampling/importance_sampling_ratio/max": 2.7272214889526367, "sampling/importance_sampling_ratio/mean": 0.568086564540863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0628271102905273, "sampling/sampling_logp_difference/mean": 0.01993260718882084, "step": 166, "step_time": 41.46945038624108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5714285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 1231.946533203125, "completions/mean_terminated_length": 143.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.822640597820282, "epoch": 0.021249522840055986, "frac_reward_zero_std": 0.6428571939468384, "grad_norm": 0.18952572345733643, "kl": 0.29749997705221176, "learning_rate": 9.78877719811681e-06, "loss": -0.014, "num_tokens": 28111505.0, "reward": 0.5535714626312256, "reward_std": 0.34529170393943787, "rewards/unified_reward/mean": 0.5535714626312256, "rewards/unified_reward/std": 0.34529173374176025, "sampling/importance_sampling_ratio/max": 2.059072256088257, "sampling/importance_sampling_ratio/mean": 0.6713743209838867, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4495682716369629, "sampling/sampling_logp_difference/mean": 0.020644966512918472, "step": 167, "step_time": 38.252501365030184 }, { "clip_ratio/high_max": 3.909915540134534e-05, "clip_ratio/high_mean": 5.585593498835806e-06, "clip_ratio/low_mean": 6.755822641935083e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.314381991818664e-05, "completions/clipped_ratio": 0.4821428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 1069.75, "completions/mean_terminated_length": 158.96551513671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.676735907793045, "epoch": 0.02137676549179285, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.39625588059425354, "kl": 0.18527455627918243, "learning_rate": 9.78750477159944e-06, "loss": 0.0719, "num_tokens": 28250453.0, "reward": 0.45654764771461487, "reward_std": 0.3651439845561981, "rewards/unified_reward/mean": 0.45654764771461487, "rewards/unified_reward/std": 0.3651439845561981, "sampling/importance_sampling_ratio/max": 2.838717460632324, "sampling/importance_sampling_ratio/mean": 0.8129587173461914, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0571320056915283, "sampling/sampling_logp_difference/mean": 0.020969580858945847, "step": 168, "step_time": 33.181701021967456 }, { "clip_ratio/high_max": 0.0009539920720271766, "clip_ratio/high_mean": 0.00015939314107527025, "clip_ratio/low_mean": 1.594305149410502e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00017533619757159613, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 1255.6160888671875, "completions/mean_terminated_length": 236.83673095703125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6324958503246307, "epoch": 0.02150400814352971, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.1672660857439041, "kl": 0.13471604883670807, "learning_rate": 9.786232345082072e-06, "loss": -0.0258, "num_tokens": 28413642.0, "reward": 0.41577383875846863, "reward_std": 0.38313964009284973, "rewards/unified_reward/mean": 0.41577383875846863, "rewards/unified_reward/std": 0.38313964009284973, "sampling/importance_sampling_ratio/max": 2.9738264083862305, "sampling/importance_sampling_ratio/mean": 0.6674231290817261, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.41344285011291504, "sampling/sampling_logp_difference/mean": 0.018661223351955414, "step": 169, "step_time": 37.32131469412707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 9.987816974899033e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 9.987816974899033e-05, "completions/clipped_ratio": 0.5089285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 1103.9554443359375, "completions/mean_terminated_length": 125.58181762695312, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6112522780895233, "epoch": 0.021631250795266573, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.28015947341918945, "kl": 0.14220467768609524, "learning_rate": 9.784959918564703e-06, "loss": -0.1595, "num_tokens": 28594981.0, "reward": 0.46175599098205566, "reward_std": 0.41967761516571045, "rewards/unified_reward/mean": 0.4617559611797333, "rewards/unified_reward/std": 0.41967761516571045, "sampling/importance_sampling_ratio/max": 2.0709047317504883, "sampling/importance_sampling_ratio/mean": 0.7182679176330566, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5955572128295898, "sampling/sampling_logp_difference/mean": 0.017628448083996773, "step": 170, "step_time": 53.64100085804239 }, { "clip_ratio/high_max": 0.0001840942568378523, "clip_ratio/high_mean": 2.6299179808120243e-05, "clip_ratio/low_mean": 5.489157683769008e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.119075664581032e-05, "completions/clipped_ratio": 0.5267857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 1171.982177734375, "completions/mean_terminated_length": 196.79244995117188, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.9258354008197784, "epoch": 0.021758493447003435, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.19509854912757874, "kl": 0.19301388040184975, "learning_rate": 9.783687492047335e-06, "loss": 0.0183, "num_tokens": 28749363.0, "reward": 0.4583333730697632, "reward_std": 0.40640518069267273, "rewards/unified_reward/mean": 0.4583333432674408, "rewards/unified_reward/std": 0.40640515089035034, "sampling/importance_sampling_ratio/max": 2.2421305179595947, "sampling/importance_sampling_ratio/mean": 0.6103391647338867, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48740148544311523, "sampling/sampling_logp_difference/mean": 0.021301647648215294, "step": 171, "step_time": 36.3509052360896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.5703538995003328e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.5703538995003328e-05, "completions/clipped_ratio": 0.5267857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 1172.15185546875, "completions/mean_terminated_length": 197.15093994140625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.3847096264362335, "epoch": 0.021885736098740297, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.21518611907958984, "kl": 0.1305698473006487, "learning_rate": 9.782415065529966e-06, "loss": -0.0174, "num_tokens": 28903756.0, "reward": 0.4412202835083008, "reward_std": 0.41548454761505127, "rewards/unified_reward/mean": 0.4412202537059784, "rewards/unified_reward/std": 0.41548454761505127, "sampling/importance_sampling_ratio/max": 1.8826851844787598, "sampling/importance_sampling_ratio/mean": 0.7565522789955139, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6718671321868896, "sampling/sampling_logp_difference/mean": 0.014721322804689407, "step": 172, "step_time": 36.80002482002601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 6.318249779724283e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.318249779724283e-05, "completions/clipped_ratio": 0.4642857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 1117.732177734375, "completions/mean_terminated_length": 311.5000305175781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.27302747964859, "epoch": 0.02201297875047716, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.502987265586853, "kl": 0.1766446903347969, "learning_rate": 9.781142639012598e-06, "loss": 0.0094, "num_tokens": 29054198.0, "reward": 0.18095238506793976, "reward_std": 0.2875969409942627, "rewards/unified_reward/mean": 0.18095238506793976, "rewards/unified_reward/std": 0.2875969409942627, "sampling/importance_sampling_ratio/max": 1.9298436641693115, "sampling/importance_sampling_ratio/mean": 0.6212033629417419, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6098341941833496, "sampling/sampling_logp_difference/mean": 0.024579457938671112, "step": 173, "step_time": 33.98450555698946 }, { "clip_ratio/high_max": 6.103515625e-05, "clip_ratio/high_mean": 8.71930842549773e-06, "clip_ratio/low_mean": 2.4267538719868753e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.298684714536648e-05, "completions/clipped_ratio": 0.4642857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 1046.3125, "completions/mean_terminated_length": 178.183349609375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8355222642421722, "epoch": 0.02214022140221402, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.28304505348205566, "kl": 0.31729546189308167, "learning_rate": 9.779870212495229e-06, "loss": 0.0581, "num_tokens": 29211265.0, "reward": 0.6696428656578064, "reward_std": 0.3531866669654846, "rewards/unified_reward/mean": 0.6696428656578064, "rewards/unified_reward/std": 0.3531866669654846, "sampling/importance_sampling_ratio/max": 2.8250203132629395, "sampling/importance_sampling_ratio/mean": 0.7778621315956116, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.45968127250671387, "sampling/sampling_logp_difference/mean": 0.019754935055971146, "step": 174, "step_time": 45.006357067031786 }, { "clip_ratio/high_max": 0.0009317617514170706, "clip_ratio/high_mean": 0.00013310882422956638, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00013310882422956638, "completions/clipped_ratio": 0.3392857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 803.6607666015625, "completions/mean_terminated_length": 164.67567443847656, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9093267321586609, "epoch": 0.022267464053950883, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.1472458690404892, "kl": 0.3415859006345272, "learning_rate": 9.77859778597786e-06, "loss": 0.0267, "num_tokens": 29327043.0, "reward": 0.42038694024086, "reward_std": 0.3148772716522217, "rewards/unified_reward/mean": 0.42038694024086, "rewards/unified_reward/std": 0.3148772716522217, "sampling/importance_sampling_ratio/max": 2.1953132152557373, "sampling/importance_sampling_ratio/mean": 0.7732634544372559, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4602682590484619, "sampling/sampling_logp_difference/mean": 0.02079848013818264, "step": 175, "step_time": 31.58134251506999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 730.0089721679688, "completions/mean_terminated_length": 202.8125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7040340602397919, "epoch": 0.022394706705687748, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.18945160508155823, "kl": 0.18619401939213276, "learning_rate": 9.777325359460492e-06, "loss": 0.0938, "num_tokens": 29431652.0, "reward": 0.5654762387275696, "reward_std": 0.4022939205169678, "rewards/unified_reward/mean": 0.5654762387275696, "rewards/unified_reward/std": 0.40229395031929016, "sampling/importance_sampling_ratio/max": 2.219324827194214, "sampling/importance_sampling_ratio/mean": 0.7419490218162537, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4870462417602539, "sampling/sampling_logp_difference/mean": 0.019100813195109367, "step": 176, "step_time": 29.594138576183468 }, { "clip_ratio/high_max": 0.0011260530445724726, "clip_ratio/high_mean": 0.00016086472169263288, "clip_ratio/low_mean": 3.2508808089914965e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001933735275088111, "completions/clipped_ratio": 0.4910714626312256, "completions/max_length": 2048.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 1151.6429443359375, "completions/mean_terminated_length": 286.7368469238281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5678434073925018, "epoch": 0.02252194935742461, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.16950789093971252, "kl": 0.2900521494448185, "learning_rate": 9.776052932943123e-06, "loss": 0.0649, "num_tokens": 29594124.0, "reward": 0.5406250357627869, "reward_std": 0.35636481642723083, "rewards/unified_reward/mean": 0.5406249761581421, "rewards/unified_reward/std": 0.3563648462295532, "sampling/importance_sampling_ratio/max": 2.592552661895752, "sampling/importance_sampling_ratio/mean": 0.7131974101066589, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.675173282623291, "sampling/sampling_logp_difference/mean": 0.018238572403788567, "step": 177, "step_time": 38.38520862092264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 4.054594410263235e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 4.054594410263235e-05, "completions/clipped_ratio": 0.2946428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 704.2678833007812, "completions/mean_terminated_length": 142.9620361328125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.0715426802635193, "epoch": 0.022649192009161472, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.30758246779441833, "kl": 0.2366483397781849, "learning_rate": 9.774780506425755e-06, "loss": 0.0134, "num_tokens": 29701698.0, "reward": 0.5110119581222534, "reward_std": 0.373272180557251, "rewards/unified_reward/mean": 0.5110118985176086, "rewards/unified_reward/std": 0.3732721507549286, "sampling/importance_sampling_ratio/max": 2.199782371520996, "sampling/importance_sampling_ratio/mean": 0.7675558924674988, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6146717071533203, "sampling/sampling_logp_difference/mean": 0.024146797135472298, "step": 178, "step_time": 30.74013420008123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.1798270608996972e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.1798270608996972e-05, "completions/clipped_ratio": 0.330357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 800.4464721679688, "completions/mean_terminated_length": 184.98666381835938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.685228407382965, "epoch": 0.022776434660898334, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3728659451007843, "kl": 0.2316359542310238, "learning_rate": 9.773508079908386e-06, "loss": -0.0443, "num_tokens": 29809068.0, "reward": 0.48571428656578064, "reward_std": 0.34747666120529175, "rewards/unified_reward/mean": 0.48571425676345825, "rewards/unified_reward/std": 0.34747666120529175, "sampling/importance_sampling_ratio/max": 2.036663770675659, "sampling/importance_sampling_ratio/mean": 0.7554137110710144, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5271310806274414, "sampling/sampling_logp_difference/mean": 0.0196585264056921, "step": 179, "step_time": 30.045146745163947 }, { "clip_ratio/high_max": 0.0015781573820277117, "clip_ratio/high_mean": 0.00024106727687467355, "clip_ratio/low_mean": 5.794951448478969e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024686222786840517, "completions/clipped_ratio": 0.3214285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1565.0, "completions/mean_length": 822.169677734375, "completions/mean_terminated_length": 241.51315307617188, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8361863493919373, "epoch": 0.022903677312635196, "frac_reward_zero_std": 0.0, "grad_norm": 0.30433928966522217, "kl": 0.14840975403785706, "learning_rate": 9.772235653391017e-06, "loss": 0.0408, "num_tokens": 29926415.0, "reward": 0.4166666865348816, "reward_std": 0.3668440878391266, "rewards/unified_reward/mean": 0.4166666865348816, "rewards/unified_reward/std": 0.3668440878391266, "sampling/importance_sampling_ratio/max": 2.9467170238494873, "sampling/importance_sampling_ratio/mean": 0.8026303052902222, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7219295501708984, "sampling/sampling_logp_difference/mean": 0.02106248028576374, "step": 180, "step_time": 33.854425145545974 }, { "clip_ratio/high_max": 7.106908014975488e-05, "clip_ratio/high_mean": 1.0152725735679269e-05, "clip_ratio/low_mean": 4.359654212748865e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.4512379948428134e-05, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 1106.71435546875, "completions/mean_terminated_length": 165.42857360839844, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.5077553391456604, "epoch": 0.02303091996437206, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.30119746923446655, "kl": 0.15774734318256378, "learning_rate": 9.770963226873649e-06, "loss": 0.0763, "num_tokens": 30081031.0, "reward": 0.37187501788139343, "reward_std": 0.37746208906173706, "rewards/unified_reward/mean": 0.37187501788139343, "rewards/unified_reward/std": 0.37746211886405945, "sampling/importance_sampling_ratio/max": 2.9115161895751953, "sampling/importance_sampling_ratio/mean": 0.6543798446655273, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4873490333557129, "sampling/sampling_logp_difference/mean": 0.023097192868590355, "step": 181, "step_time": 39.26345537905581 }, { "clip_ratio/high_max": 0.0006738805459463038, "clip_ratio/high_mean": 9.626864630263299e-05, "clip_ratio/low_mean": 5.151346613274654e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00010141999337065499, "completions/clipped_ratio": 0.4107142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1377.0, "completions/mean_length": 1002.0803833007812, "completions/mean_terminated_length": 273.1060791015625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.9487312138080597, "epoch": 0.02315816261610892, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3077857494354248, "kl": 0.133964778855443, "learning_rate": 9.76969080035628e-06, "loss": 0.0543, "num_tokens": 30215920.0, "reward": 0.4970238506793976, "reward_std": 0.38780340552330017, "rewards/unified_reward/mean": 0.4970238208770752, "rewards/unified_reward/std": 0.3878033757209778, "sampling/importance_sampling_ratio/max": 2.9899375438690186, "sampling/importance_sampling_ratio/mean": 0.7515705823898315, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5661840438842773, "sampling/sampling_logp_difference/mean": 0.019877970218658447, "step": 182, "step_time": 32.43341347691603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3839285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 951.5982666015625, "completions/mean_terminated_length": 268.3333435058594, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.467532306909561, "epoch": 0.023285405267845782, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.5354014039039612, "kl": 0.16532335430383682, "learning_rate": 9.768418373838912e-06, "loss": 0.0171, "num_tokens": 30343083.0, "reward": 0.5270833969116211, "reward_std": 0.36664703488349915, "rewards/unified_reward/mean": 0.5270833373069763, "rewards/unified_reward/std": 0.36664703488349915, "sampling/importance_sampling_ratio/max": 2.777941942214966, "sampling/importance_sampling_ratio/mean": 0.6774060130119324, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49596261978149414, "sampling/sampling_logp_difference/mean": 0.024306638166308403, "step": 183, "step_time": 31.282523727975786 }, { "clip_ratio/high_max": 0.000335693359375, "clip_ratio/high_mean": 4.795619497599546e-05, "clip_ratio/low_mean": 4.359654212748865e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.2315849643491674e-05, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 1214.8035888671875, "completions/mean_terminated_length": 381.6071472167969, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.090845078229904, "epoch": 0.023412647919582644, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.3575410544872284, "kl": 0.21856021694839, "learning_rate": 9.767145947321543e-06, "loss": -0.0078, "num_tokens": 30521445.0, "reward": 0.453869104385376, "reward_std": 0.3516227602958679, "rewards/unified_reward/mean": 0.4538690745830536, "rewards/unified_reward/std": 0.35162273049354553, "sampling/importance_sampling_ratio/max": 2.990307569503784, "sampling/importance_sampling_ratio/mean": 0.7348729372024536, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48768138885498047, "sampling/sampling_logp_difference/mean": 0.02085263840854168, "step": 184, "step_time": 47.93586863693781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 648.2589721679688, "completions/mean_terminated_length": 225.08139038085938, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 2.0304949581623077, "epoch": 0.023539890571319506, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.528537929058075, "kl": 0.26789233833551407, "learning_rate": 9.765873520804174e-06, "loss": 0.017, "num_tokens": 30613538.0, "reward": 0.4017857313156128, "reward_std": 0.38314303755760193, "rewards/unified_reward/mean": 0.4017857015132904, "rewards/unified_reward/std": 0.38314303755760193, "sampling/importance_sampling_ratio/max": 2.2594549655914307, "sampling/importance_sampling_ratio/mean": 0.8198783993721008, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6742033958435059, "sampling/sampling_logp_difference/mean": 0.02216327376663685, "step": 185, "step_time": 27.290638003963977 }, { "clip_ratio/high_max": 6.103515625e-05, "clip_ratio/high_mean": 8.71930842549773e-06, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.71930842549773e-06, "completions/clipped_ratio": 0.4821428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1172.009033203125, "completions/mean_terminated_length": 356.4310302734375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "entropy": 2.1839796602725983, "epoch": 0.02366713322305637, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.36288338899612427, "kl": 0.14909601397812366, "learning_rate": 9.764601094286806e-06, "loss": -0.0471, "num_tokens": 30764963.0, "reward": 0.5014881491661072, "reward_std": 0.37284284830093384, "rewards/unified_reward/mean": 0.5014881491661072, "rewards/unified_reward/std": 0.37284284830093384, "sampling/importance_sampling_ratio/max": 2.9153921604156494, "sampling/importance_sampling_ratio/mean": 0.6229192018508911, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49956679344177246, "sampling/sampling_logp_difference/mean": 0.02181105874478817, "step": 186, "step_time": 36.2754434440285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 860.6428833007812, "completions/mean_terminated_length": 385.70001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.1194036304950714, "epoch": 0.02379437587479323, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.3948516547679901, "kl": 0.14339642599225044, "learning_rate": 9.763328667769437e-06, "loss": 0.0304, "num_tokens": 30911387.0, "reward": 0.4747024178504944, "reward_std": 0.43741998076438904, "rewards/unified_reward/mean": 0.474702388048172, "rewards/unified_reward/std": 0.4374200105667114, "sampling/importance_sampling_ratio/max": 2.5387744903564453, "sampling/importance_sampling_ratio/mean": 0.7445117235183716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4614124298095703, "sampling/sampling_logp_difference/mean": 0.021694205701351166, "step": 187, "step_time": 46.14857475506142 }, { "clip_ratio/high_max": 0.0008337894978467375, "clip_ratio/high_mean": 0.0001573923454998294, "clip_ratio/low_mean": 2.7414677333581494e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018480702328815823, "completions/clipped_ratio": 0.4375000298023224, "completions/max_length": 2048.0, "completions/max_terminated_length": 1392.0, "completions/mean_length": 1044.482177734375, "completions/mean_terminated_length": 263.96826171875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 1.7815317809581757, "epoch": 0.023921618526530092, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.46729576587677, "kl": 0.1382328812032938, "learning_rate": 9.762056241252069e-06, "loss": 0.1146, "num_tokens": 31053689.0, "reward": 0.5116071701049805, "reward_std": 0.320166677236557, "rewards/unified_reward/mean": 0.5116071701049805, "rewards/unified_reward/std": 0.320166677236557, "sampling/importance_sampling_ratio/max": 2.9144225120544434, "sampling/importance_sampling_ratio/mean": 0.774597704410553, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4315829277038574, "sampling/sampling_logp_difference/mean": 0.017468102276325226, "step": 188, "step_time": 32.66366580221802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 4.800307124241954e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 4.800307124241954e-06, "completions/clipped_ratio": 0.455357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1241.134033203125, "completions/mean_terminated_length": 566.5409545898438, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.2644528448581696, "epoch": 0.024048861178266955, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.25147831439971924, "kl": 0.1351495496928692, "learning_rate": 9.7607838147347e-06, "loss": -0.0652, "num_tokens": 31213800.0, "reward": 0.5258928537368774, "reward_std": 0.3793522119522095, "rewards/unified_reward/mean": 0.5258928537368774, "rewards/unified_reward/std": 0.3793522119522095, "sampling/importance_sampling_ratio/max": 2.8273680210113525, "sampling/importance_sampling_ratio/mean": 0.734562337398529, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5946768522262573, "sampling/sampling_logp_difference/mean": 0.02024332992732525, "step": 189, "step_time": 36.162236808333546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 695.7500610351562, "completions/mean_terminated_length": 326.9545593261719, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.825264185667038, "epoch": 0.024176103830003817, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.2960972785949707, "kl": 0.18475137278437614, "learning_rate": 9.759511388217331e-06, "loss": -0.005, "num_tokens": 31314836.0, "reward": 0.4962798058986664, "reward_std": 0.3975948095321655, "rewards/unified_reward/mean": 0.496279776096344, "rewards/unified_reward/std": 0.3975948095321655, "sampling/importance_sampling_ratio/max": 2.7852365970611572, "sampling/importance_sampling_ratio/mean": 0.7226607203483582, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4485032558441162, "sampling/sampling_logp_difference/mean": 0.020283300429582596, "step": 190, "step_time": 28.022276101168245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 738.982177734375, "completions/mean_terminated_length": 323.1764831542969, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.7211112976074219, "epoch": 0.02430334648174068, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.31608009338378906, "kl": 0.24595000222325325, "learning_rate": 9.758238961699963e-06, "loss": 0.0959, "num_tokens": 31416106.0, "reward": 0.547619104385376, "reward_std": 0.41019207239151, "rewards/unified_reward/mean": 0.5476190447807312, "rewards/unified_reward/std": 0.41019207239151, "sampling/importance_sampling_ratio/max": 2.45432186126709, "sampling/importance_sampling_ratio/mean": 0.778264045715332, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4780392646789551, "sampling/sampling_logp_difference/mean": 0.021044163033366203, "step": 191, "step_time": 28.708542729029432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 3.401280173420673e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.401280173420673e-05, "completions/clipped_ratio": 0.3035714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 847.2678833007812, "completions/mean_terminated_length": 323.8717956542969, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.4237556755542755, "epoch": 0.02443058913347754, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.33176228404045105, "kl": 0.2191813923418522, "learning_rate": 9.756966535182594e-06, "loss": -0.1014, "num_tokens": 31536864.0, "reward": 0.5336309671401978, "reward_std": 0.32061299681663513, "rewards/unified_reward/mean": 0.5336309671401978, "rewards/unified_reward/std": 0.32061299681663513, "sampling/importance_sampling_ratio/max": 2.212101459503174, "sampling/importance_sampling_ratio/mean": 0.6404625773429871, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.614520788192749, "sampling/sampling_logp_difference/mean": 0.023013314232230186, "step": 192, "step_time": 31.263674087123945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 430.2500305175781, "completions/mean_terminated_length": 199.14285278320312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.027769058942795, "epoch": 0.024557831785214403, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.5611653923988342, "kl": 0.7574684545397758, "learning_rate": 9.755694108665226e-06, "loss": 0.0978, "num_tokens": 31609540.0, "reward": 0.6175596117973328, "reward_std": 0.3680144250392914, "rewards/unified_reward/mean": 0.617559552192688, "rewards/unified_reward/std": 0.36801445484161377, "sampling/importance_sampling_ratio/max": 2.6615452766418457, "sampling/importance_sampling_ratio/mean": 0.8562899231910706, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.40590381622314453, "sampling/sampling_logp_difference/mean": 0.023122591897845268, "step": 193, "step_time": 24.84503029100597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 443.3482360839844, "completions/mean_terminated_length": 175.90625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2062989473342896, "epoch": 0.024685074436951265, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.4704764485359192, "kl": 0.5395421981811523, "learning_rate": 9.754421682147857e-06, "loss": 0.011, "num_tokens": 31675371.0, "reward": 0.6398810148239136, "reward_std": 0.3884941637516022, "rewards/unified_reward/mean": 0.6398810148239136, "rewards/unified_reward/std": 0.3884941637516022, "sampling/importance_sampling_ratio/max": 2.9753787517547607, "sampling/importance_sampling_ratio/mean": 0.8833706974983215, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46054983139038086, "sampling/sampling_logp_difference/mean": 0.02467307075858116, "step": 194, "step_time": 23.577247082954273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 589.25, "completions/mean_terminated_length": 309.9148864746094, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.8703831434249878, "epoch": 0.024812317088688127, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.7582011222839355, "kl": 0.23942574113607407, "learning_rate": 9.753149255630488e-06, "loss": 0.0363, "num_tokens": 31764143.0, "reward": 0.3851190507411957, "reward_std": 0.34983450174331665, "rewards/unified_reward/mean": 0.3851190209388733, "rewards/unified_reward/std": 0.34983450174331665, "sampling/importance_sampling_ratio/max": 2.8940391540527344, "sampling/importance_sampling_ratio/mean": 0.795221209526062, "sampling/importance_sampling_ratio/min": 0.0008048297604545951, "sampling/sampling_logp_difference/max": 0.4680004119873047, "sampling/sampling_logp_difference/mean": 0.02343156561255455, "step": 195, "step_time": 25.92147987987846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.596095888904529e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.596095888904529e-06, "completions/clipped_ratio": 0.2053571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 685.9732666015625, "completions/mean_terminated_length": 333.98876953125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.3246504366397858, "epoch": 0.024939559740424992, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.1358681172132492, "kl": 0.22139807790517807, "learning_rate": 9.75187682911312e-06, "loss": 0.0001, "num_tokens": 31892372.0, "reward": 0.3950892984867096, "reward_std": 0.39512571692466736, "rewards/unified_reward/mean": 0.3950892984867096, "rewards/unified_reward/std": 0.39512568712234497, "sampling/importance_sampling_ratio/max": 2.962468385696411, "sampling/importance_sampling_ratio/mean": 0.7197166085243225, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4771599769592285, "sampling/sampling_logp_difference/mean": 0.023475835099816322, "step": 196, "step_time": 46.991177941905335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285746216774, "completions/max_length": 2048.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 371.9732360839844, "completions/mean_terminated_length": 243.04808044433594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5037015676498413, "epoch": 0.025066802392161854, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.912161648273468, "kl": 0.702083446085453, "learning_rate": 9.750604402595751e-06, "loss": 0.0392, "num_tokens": 31973545.0, "reward": 0.4791666865348816, "reward_std": 0.28982147574424744, "rewards/unified_reward/mean": 0.4791666865348816, "rewards/unified_reward/std": 0.2898215055465698, "sampling/importance_sampling_ratio/max": 2.5673885345458984, "sampling/importance_sampling_ratio/mean": 0.8175418972969055, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6311852931976318, "sampling/sampling_logp_difference/mean": 0.0262010358273983, "step": 197, "step_time": 33.58239530515857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 425.0982360839844, "completions/mean_terminated_length": 248.34652709960938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3891566097736359, "epoch": 0.025194045043898716, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.6663044095039368, "kl": 0.27156634256243706, "learning_rate": 9.749331976078383e-06, "loss": 0.0021, "num_tokens": 32054900.0, "reward": 0.45014888048171997, "reward_std": 0.37863683700561523, "rewards/unified_reward/mean": 0.4501488506793976, "rewards/unified_reward/std": 0.37863683700561523, "sampling/importance_sampling_ratio/max": 2.925525426864624, "sampling/importance_sampling_ratio/mean": 0.9147777557373047, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4908742904663086, "sampling/sampling_logp_difference/mean": 0.01911231502890587, "step": 198, "step_time": 27.03156040632166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 569.919677734375, "completions/mean_terminated_length": 286.8829650878906, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.8542899191379547, "epoch": 0.02532128769563558, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.35584843158721924, "kl": 0.22734422609210014, "learning_rate": 9.748059549561014e-06, "loss": -0.0695, "num_tokens": 32144763.0, "reward": 0.5425595641136169, "reward_std": 0.4015662372112274, "rewards/unified_reward/mean": 0.5425595045089722, "rewards/unified_reward/std": 0.4015662670135498, "sampling/importance_sampling_ratio/max": 2.786343812942505, "sampling/importance_sampling_ratio/mean": 0.8585513830184937, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5778579711914062, "sampling/sampling_logp_difference/mean": 0.018417876213788986, "step": 199, "step_time": 26.816623442806304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875000149011612, "completions/max_length": 2048.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 601.9375, "completions/mean_terminated_length": 268.23077392578125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.154280424118042, "epoch": 0.02544853034737244, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.39660993218421936, "kl": 0.24961626529693604, "learning_rate": 9.746787123043645e-06, "loss": -0.0043, "num_tokens": 32238220.0, "reward": 0.508928656578064, "reward_std": 0.3530854284763336, "rewards/unified_reward/mean": 0.5089285969734192, "rewards/unified_reward/std": 0.35308539867401123, "sampling/importance_sampling_ratio/max": 2.2287561893463135, "sampling/importance_sampling_ratio/mean": 0.8602425456047058, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4023704528808594, "sampling/sampling_logp_difference/mean": 0.02342001162469387, "step": 200, "step_time": 26.3537045547273 }, { "clip_ratio/high_max": 0.0003929077684006188, "clip_ratio/high_mean": 5.6129681070160586e-05, "clip_ratio/low_mean": 6.34445168543607e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011957419701502658, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 564.3125, "completions/mean_terminated_length": 137.96551513671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.8815581798553467, "epoch": 0.025575772999109302, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.42940032482147217, "kl": 0.45544107630848885, "learning_rate": 9.745514696526275e-06, "loss": 0.0424, "num_tokens": 32328687.0, "reward": 0.6558035612106323, "reward_std": 0.34333863854408264, "rewards/unified_reward/mean": 0.6558035612106323, "rewards/unified_reward/std": 0.34333866834640503, "sampling/importance_sampling_ratio/max": 2.318068265914917, "sampling/importance_sampling_ratio/mean": 0.8532822728157043, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7178783416748047, "sampling/sampling_logp_difference/mean": 0.018834266811609268, "step": 201, "step_time": 27.95240018889308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2321428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 672.0625, "completions/mean_terminated_length": 256.0813903808594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0560831427574158, "epoch": 0.025703015650846164, "frac_reward_zero_std": 0.5, "grad_norm": 0.19815689325332642, "kl": 0.4592304155230522, "learning_rate": 9.744242270008908e-06, "loss": -0.0143, "num_tokens": 32430446.0, "reward": 0.5535714626312256, "reward_std": 0.4126468300819397, "rewards/unified_reward/mean": 0.5535714030265808, "rewards/unified_reward/std": 0.4126468598842621, "sampling/importance_sampling_ratio/max": 2.5304441452026367, "sampling/importance_sampling_ratio/mean": 0.7766565680503845, "sampling/importance_sampling_ratio/min": 0.0013007745146751404, "sampling/sampling_logp_difference/max": 0.4004955291748047, "sampling/sampling_logp_difference/mean": 0.023236144334077835, "step": 202, "step_time": 27.922265715897083 }, { "clip_ratio/high_max": 0.0002267916570417583, "clip_ratio/high_mean": 3.239880970795639e-05, "clip_ratio/low_mean": 4.8581614919385174e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.098042098936276e-05, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 668.7232666015625, "completions/mean_terminated_length": 208.96429443359375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.517095923423767, "epoch": 0.025830258302583026, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.3131510317325592, "kl": 0.3119339197874069, "learning_rate": 9.742969843491538e-06, "loss": -0.0171, "num_tokens": 32530007.0, "reward": 0.37886908650398254, "reward_std": 0.3832255005836487, "rewards/unified_reward/mean": 0.37886905670166016, "rewards/unified_reward/std": 0.3832255005836487, "sampling/importance_sampling_ratio/max": 2.2734882831573486, "sampling/importance_sampling_ratio/mean": 0.720687747001648, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.3840376138687134, "sampling/sampling_logp_difference/mean": 0.02614304982125759, "step": 203, "step_time": 29.11460607405752 }, { "clip_ratio/high_max": 5.239991514827125e-05, "clip_ratio/high_mean": 7.485702099074842e-06, "clip_ratio/low_mean": 6.0769589254050516e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.3562661024479894e-05, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 751.5267944335938, "completions/mean_terminated_length": 319.3690490722656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0472882986068726, "epoch": 0.02595750095431989, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.34200379252433777, "kl": 0.2324501983821392, "learning_rate": 9.741697416974171e-06, "loss": 0.0398, "num_tokens": 32633738.0, "reward": 0.45505955815315247, "reward_std": 0.3190470039844513, "rewards/unified_reward/mean": 0.4550595283508301, "rewards/unified_reward/std": 0.3190469741821289, "sampling/importance_sampling_ratio/max": 2.9064722061157227, "sampling/importance_sampling_ratio/mean": 0.6802142262458801, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.43967485427856445, "sampling/sampling_logp_difference/mean": 0.027453960850834846, "step": 204, "step_time": 29.074410883011296 }, { "clip_ratio/high_max": 0.00047704124517622404, "clip_ratio/high_mean": 6.814874905103352e-05, "clip_ratio/low_mean": 1.1379413081158418e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.952816304168664e-05, "completions/clipped_ratio": 0.2946428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 788.9464721679688, "completions/mean_terminated_length": 263.0126647949219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4044359922409058, "epoch": 0.02608474360605675, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.3583173453807831, "kl": 0.42284800857305527, "learning_rate": 9.7404249904568e-06, "loss": 0.0123, "num_tokens": 32764732.0, "reward": 0.5252976417541504, "reward_std": 0.36961671710014343, "rewards/unified_reward/mean": 0.5252976417541504, "rewards/unified_reward/std": 0.36961671710014343, "sampling/importance_sampling_ratio/max": 2.8625729084014893, "sampling/importance_sampling_ratio/mean": 0.7581349611282349, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.40637850761413574, "sampling/sampling_logp_difference/mean": 0.02582349069416523, "step": 205, "step_time": 35.104940097546205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 979.0535888671875, "completions/mean_terminated_length": 493.1688232421875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.429672598838806, "epoch": 0.026211986257793612, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.31104299426078796, "kl": 0.3725324310362339, "learning_rate": 9.739152563939434e-06, "loss": 0.0458, "num_tokens": 32906602.0, "reward": 0.626339316368103, "reward_std": 0.3052234351634979, "rewards/unified_reward/mean": 0.626339316368103, "rewards/unified_reward/std": 0.3052234351634979, "sampling/importance_sampling_ratio/max": 2.7010488510131836, "sampling/importance_sampling_ratio/mean": 0.6467918753623962, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5001654624938965, "sampling/sampling_logp_difference/mean": 0.024371478706598282, "step": 206, "step_time": 32.95059931604192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 511.0535888671875, "completions/mean_terminated_length": 216.7446746826172, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.763594925403595, "epoch": 0.026339228909530474, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.4145755171775818, "kl": 0.3846841864287853, "learning_rate": 9.737880137422064e-06, "loss": -0.011, "num_tokens": 32985864.0, "reward": 0.4293155074119568, "reward_std": 0.37588897347450256, "rewards/unified_reward/mean": 0.4293155074119568, "rewards/unified_reward/std": 0.37588897347450256, "sampling/importance_sampling_ratio/max": 1.8131407499313354, "sampling/importance_sampling_ratio/mean": 0.7832189202308655, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4663882255554199, "sampling/sampling_logp_difference/mean": 0.02695477567613125, "step": 207, "step_time": 24.720893257996067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 772.9375610351562, "completions/mean_terminated_length": 425.1932067871094, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4543448090553284, "epoch": 0.026466471561267337, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.5542331337928772, "kl": 0.20757080242037773, "learning_rate": 9.736607710904697e-06, "loss": 0.0134, "num_tokens": 33090585.0, "reward": 0.5446428656578064, "reward_std": 0.3669683039188385, "rewards/unified_reward/mean": 0.5446428656578064, "rewards/unified_reward/std": 0.3669683039188385, "sampling/importance_sampling_ratio/max": 2.6637966632843018, "sampling/importance_sampling_ratio/mean": 0.6859043836593628, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.3983726501464844, "sampling/sampling_logp_difference/mean": 0.026424117386341095, "step": 208, "step_time": 28.66267484333366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 705.8839721679688, "completions/mean_terminated_length": 339.852294921875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.731836348772049, "epoch": 0.0265937142130042, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.4299235939979553, "kl": 0.19726766273379326, "learning_rate": 9.735335284387326e-06, "loss": 0.053, "num_tokens": 33191652.0, "reward": 0.6354167461395264, "reward_std": 0.37152209877967834, "rewards/unified_reward/mean": 0.6354166865348816, "rewards/unified_reward/std": 0.37152206897735596, "sampling/importance_sampling_ratio/max": 2.9097166061401367, "sampling/importance_sampling_ratio/mean": 0.8131805062294006, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4488215446472168, "sampling/sampling_logp_difference/mean": 0.01889723539352417, "step": 209, "step_time": 27.99887882801704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 395.77679443359375, "completions/mean_terminated_length": 215.83168029785156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.06941419839859, "epoch": 0.02672095686474106, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.37405645847320557, "kl": 0.49447696283459663, "learning_rate": 9.73406285786996e-06, "loss": 0.0923, "num_tokens": 33262491.0, "reward": 0.47604167461395264, "reward_std": 0.3585909903049469, "rewards/unified_reward/mean": 0.47604164481163025, "rewards/unified_reward/std": 0.3585909605026245, "sampling/importance_sampling_ratio/max": 2.142735004425049, "sampling/importance_sampling_ratio/mean": 0.7281144261360168, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4550352096557617, "sampling/sampling_logp_difference/mean": 0.025361616164445877, "step": 210, "step_time": 27.509058783994988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3035714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 793.7410888671875, "completions/mean_terminated_length": 247.0128173828125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.8646839261054993, "epoch": 0.026848199516477923, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.4275408089160919, "kl": 0.18945287540555, "learning_rate": 9.73279043135259e-06, "loss": -0.0042, "num_tokens": 33370062.0, "reward": 0.4529762268066406, "reward_std": 0.41807398200035095, "rewards/unified_reward/mean": 0.45297619700431824, "rewards/unified_reward/std": 0.41807398200035095, "sampling/importance_sampling_ratio/max": 2.674398422241211, "sampling/importance_sampling_ratio/mean": 0.831838071346283, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6572577953338623, "sampling/sampling_logp_difference/mean": 0.019180798903107643, "step": 211, "step_time": 30.155662928009406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3750000298023224, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 957.8750610351562, "completions/mean_terminated_length": 303.79998779296875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.82140976190567, "epoch": 0.026975442168214785, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.2129037082195282, "kl": 0.23710469156503677, "learning_rate": 9.731518004835222e-06, "loss": 0.0083, "num_tokens": 33496632.0, "reward": 0.5851190686225891, "reward_std": 0.3421020805835724, "rewards/unified_reward/mean": 0.5851190686225891, "rewards/unified_reward/std": 0.34210205078125, "sampling/importance_sampling_ratio/max": 2.360651731491089, "sampling/importance_sampling_ratio/mean": 0.5933659672737122, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4268975257873535, "sampling/sampling_logp_difference/mean": 0.026330536231398582, "step": 212, "step_time": 31.59147632983513 }, { "clip_ratio/high_max": 5.111428981763311e-05, "clip_ratio/high_mean": 7.302041467482923e-06, "clip_ratio/low_mean": 2.9511277716665063e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.6813319184147986e-05, "completions/clipped_ratio": 0.2857142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 790.4910888671875, "completions/mean_terminated_length": 287.4875183105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.5701984167099, "epoch": 0.027102684819951647, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.2508915960788727, "kl": 0.3214876763522625, "learning_rate": 9.730245578317852e-06, "loss": -0.058, "num_tokens": 33610807.0, "reward": 0.556398868560791, "reward_std": 0.34765225648880005, "rewards/unified_reward/mean": 0.5563988089561462, "rewards/unified_reward/std": 0.34765228629112244, "sampling/importance_sampling_ratio/max": 2.5087969303131104, "sampling/importance_sampling_ratio/mean": 0.7349241971969604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5926212072372437, "sampling/sampling_logp_difference/mean": 0.02387704700231552, "step": 213, "step_time": 30.132840274833143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 685.4375610351562, "completions/mean_terminated_length": 140.41250610351562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.5170543789863586, "epoch": 0.02722992747168851, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.6963781714439392, "kl": 0.5418679714202881, "learning_rate": 9.728973151800485e-06, "loss": -0.0765, "num_tokens": 33736520.0, "reward": 0.42038694024086, "reward_std": 0.34988707304000854, "rewards/unified_reward/mean": 0.42038694024086, "rewards/unified_reward/std": 0.34988707304000854, "sampling/importance_sampling_ratio/max": 2.738248825073242, "sampling/importance_sampling_ratio/mean": 0.6897520422935486, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.179931640625, "sampling/sampling_logp_difference/mean": 0.032753244042396545, "step": 214, "step_time": 39.40715455519967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.330357164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 777.3928833007812, "completions/mean_terminated_length": 150.55999755859375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.6690027713775635, "epoch": 0.02735717012342537, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.23847819864749908, "kl": 0.241555105894804, "learning_rate": 9.727700725283115e-06, "loss": 0.0698, "num_tokens": 33842908.0, "reward": 0.6361607313156128, "reward_std": 0.4034690260887146, "rewards/unified_reward/mean": 0.6361607313156128, "rewards/unified_reward/std": 0.4034689962863922, "sampling/importance_sampling_ratio/max": 2.9239282608032227, "sampling/importance_sampling_ratio/mean": 0.7637571692466736, "sampling/importance_sampling_ratio/min": 0.0036099355202168226, "sampling/sampling_logp_difference/max": 0.40044665336608887, "sampling/sampling_logp_difference/mean": 0.023345673456788063, "step": 215, "step_time": 29.878852458670735 }, { "clip_ratio/high_max": 0.0006501048992504366, "clip_ratio/high_mean": 9.287213106290437e-05, "clip_ratio/low_mean": 5.810034963360522e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 9.868216511677019e-05, "completions/clipped_ratio": 0.2589285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 647.8125, "completions/mean_terminated_length": 158.59036254882812, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5392778515815735, "epoch": 0.027484412775162233, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.6573565006256104, "kl": 0.30719585716724396, "learning_rate": 9.726428298765746e-06, "loss": -0.0297, "num_tokens": 33942703.0, "reward": 0.439732164144516, "reward_std": 0.3391404449939728, "rewards/unified_reward/mean": 0.4397321343421936, "rewards/unified_reward/std": 0.3391404449939728, "sampling/importance_sampling_ratio/max": 2.7644052505493164, "sampling/importance_sampling_ratio/mean": 0.8178528547286987, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0049521923065186, "sampling/sampling_logp_difference/mean": 0.02573239430785179, "step": 216, "step_time": 29.380366128869355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3392857313156128, "completions/max_length": 2048.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 774.107177734375, "completions/mean_terminated_length": 119.9459457397461, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.842689275741577, "epoch": 0.027611655426899098, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.4599837362766266, "kl": 0.24186299741268158, "learning_rate": 9.72515587224838e-06, "loss": 0.0565, "num_tokens": 34053611.0, "reward": 0.5339285731315613, "reward_std": 0.35914987325668335, "rewards/unified_reward/mean": 0.5339285731315613, "rewards/unified_reward/std": 0.35914987325668335, "sampling/importance_sampling_ratio/max": 2.8924782276153564, "sampling/importance_sampling_ratio/mean": 0.8042206168174744, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9685895442962646, "sampling/sampling_logp_difference/mean": 0.02473442628979683, "step": 217, "step_time": 32.28478726930916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.432438774732873e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.432438774732873e-05, "completions/clipped_ratio": 0.3035714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 698.7232666015625, "completions/mean_terminated_length": 110.5769271850586, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.073794811964035, "epoch": 0.02773889807863596, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.3298756778240204, "kl": 0.4871565103530884, "learning_rate": 9.723883445731009e-06, "loss": -0.1664, "num_tokens": 34158796.0, "reward": 0.5674107074737549, "reward_std": 0.41517987847328186, "rewards/unified_reward/mean": 0.5674107074737549, "rewards/unified_reward/std": 0.4151798486709595, "sampling/importance_sampling_ratio/max": 1.8284425735473633, "sampling/importance_sampling_ratio/mean": 0.8485844731330872, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5759811401367188, "sampling/sampling_logp_difference/mean": 0.01976759172976017, "step": 218, "step_time": 30.227933151181787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1517.0, "completions/mean_length": 781.0892944335938, "completions/mean_terminated_length": 205.2207794189453, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.342552125453949, "epoch": 0.027866140730372822, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.39878299832344055, "kl": 0.2696391586214304, "learning_rate": 9.722611019213642e-06, "loss": 0.0459, "num_tokens": 34273478.0, "reward": 0.5377976894378662, "reward_std": 0.39646828174591064, "rewards/unified_reward/mean": 0.5377976298332214, "rewards/unified_reward/std": 0.39646828174591064, "sampling/importance_sampling_ratio/max": 2.5557572841644287, "sampling/importance_sampling_ratio/mean": 0.7682135701179504, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.603082537651062, "sampling/sampling_logp_difference/mean": 0.021105265244841576, "step": 219, "step_time": 30.721682591130957 }, { "clip_ratio/high_max": 0.000629670059424825, "clip_ratio/high_mean": 8.995286589197349e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.995286589197349e-05, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 604.875, "completions/mean_terminated_length": 146.47059631347656, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.7909165024757385, "epoch": 0.027993383382109684, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.5174288749694824, "kl": 0.5542485378682613, "learning_rate": 9.721338592696272e-06, "loss": 0.0583, "num_tokens": 34364136.0, "reward": 0.4888392984867096, "reward_std": 0.4133131504058838, "rewards/unified_reward/mean": 0.4888392984867096, "rewards/unified_reward/std": 0.4133131802082062, "sampling/importance_sampling_ratio/max": 2.2063262462615967, "sampling/importance_sampling_ratio/mean": 0.8675354719161987, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6195559501647949, "sampling/sampling_logp_difference/mean": 0.020818671211600304, "step": 220, "step_time": 28.47435105405748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1071428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 293.39288330078125, "completions/mean_terminated_length": 82.83999633789062, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7445549368858337, "epoch": 0.028120626033846546, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.34252721071243286, "kl": 0.6131259351968765, "learning_rate": 9.720066166178905e-06, "loss": 0.0964, "num_tokens": 34421620.0, "reward": 0.7083333730697632, "reward_std": 0.3212915062904358, "rewards/unified_reward/mean": 0.7083333730697632, "rewards/unified_reward/std": 0.3212914764881134, "sampling/importance_sampling_ratio/max": 2.323822021484375, "sampling/importance_sampling_ratio/mean": 0.9254361987113953, "sampling/importance_sampling_ratio/min": 0.006902977358549833, "sampling/sampling_logp_difference/max": 0.5473651885986328, "sampling/sampling_logp_difference/mean": 0.02315010316669941, "step": 221, "step_time": 23.02014463674277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 672.7410888671875, "completions/mean_terminated_length": 122.63750457763672, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.0379561483860016, "epoch": 0.02824786868558341, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.2736360430717468, "kl": 0.2983572669327259, "learning_rate": 9.718793739661535e-06, "loss": 0.1436, "num_tokens": 34520151.0, "reward": 0.4181548058986664, "reward_std": 0.4006660580635071, "rewards/unified_reward/mean": 0.418154776096344, "rewards/unified_reward/std": 0.4006660580635071, "sampling/importance_sampling_ratio/max": 1.86128830909729, "sampling/importance_sampling_ratio/mean": 0.8332432508468628, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.3637291193008423, "sampling/sampling_logp_difference/mean": 0.016925644129514694, "step": 222, "step_time": 29.763668784173205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1607142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1629.0, "completions/mean_length": 495.7589416503906, "completions/mean_terminated_length": 198.52127075195312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5788563191890717, "epoch": 0.02837511133732027, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.4734818637371063, "kl": 0.38309090957045555, "learning_rate": 9.717521313144168e-06, "loss": 0.1764, "num_tokens": 34595068.0, "reward": 0.4891369342803955, "reward_std": 0.34046947956085205, "rewards/unified_reward/mean": 0.4891369044780731, "rewards/unified_reward/std": 0.34046944975852966, "sampling/importance_sampling_ratio/max": 2.556257724761963, "sampling/importance_sampling_ratio/mean": 0.9216794967651367, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.38244783878326416, "sampling/sampling_logp_difference/mean": 0.01770443096756935, "step": 223, "step_time": 26.374372799880803 }, { "clip_ratio/high_max": 0.0006402064282156061, "clip_ratio/high_mean": 9.145806234300835e-05, "clip_ratio/low_mean": 2.3513896394433687e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00011497195555421058, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 623.9285888671875, "completions/mean_terminated_length": 171.57647705078125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.242325931787491, "epoch": 0.028502353989057132, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.4207785129547119, "kl": 0.38019660860300064, "learning_rate": 9.716248886626798e-06, "loss": -0.0127, "num_tokens": 34693412.0, "reward": 0.6302083730697632, "reward_std": 0.33535072207450867, "rewards/unified_reward/mean": 0.6302083730697632, "rewards/unified_reward/std": 0.33535072207450867, "sampling/importance_sampling_ratio/max": 2.593384265899658, "sampling/importance_sampling_ratio/mean": 0.7414463758468628, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47874975204467773, "sampling/sampling_logp_difference/mean": 0.024015303701162338, "step": 224, "step_time": 31.990096481051296 }, { "clip_ratio/high_max": 0.0004329211624281015, "clip_ratio/high_mean": 6.184588073665509e-05, "clip_ratio/low_mean": 4.7486339553870494e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00010933221801678883, "completions/clipped_ratio": 0.3214285969734192, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 777.9464721679688, "completions/mean_terminated_length": 176.34210205078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.7706666886806488, "epoch": 0.028629596640793994, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.22869502007961273, "kl": 0.2513616103678942, "learning_rate": 9.71497646010943e-06, "loss": 0.0206, "num_tokens": 34808534.0, "reward": 0.47857144474983215, "reward_std": 0.3505496084690094, "rewards/unified_reward/mean": 0.47857141494750977, "rewards/unified_reward/std": 0.3505496084690094, "sampling/importance_sampling_ratio/max": 1.4452309608459473, "sampling/importance_sampling_ratio/mean": 0.6807414293289185, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9970874786376953, "sampling/sampling_logp_difference/mean": 0.014386341907083988, "step": 225, "step_time": 30.103736866731197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1309.0, "completions/mean_length": 347.5625305175781, "completions/mean_terminated_length": 104.64285278320312, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.9857953786849976, "epoch": 0.028756839292530856, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.7750895619392395, "kl": 0.9272170066833496, "learning_rate": 9.71370403359206e-06, "loss": -0.0856, "num_tokens": 34871637.0, "reward": 0.5453869104385376, "reward_std": 0.3235672116279602, "rewards/unified_reward/mean": 0.5453869104385376, "rewards/unified_reward/std": 0.3235672116279602, "sampling/importance_sampling_ratio/max": 2.0821876525878906, "sampling/importance_sampling_ratio/mean": 0.916778028011322, "sampling/importance_sampling_ratio/min": 0.009557248093187809, "sampling/sampling_logp_difference/max": 0.5510153770446777, "sampling/sampling_logp_difference/mean": 0.020470449700951576, "step": 226, "step_time": 24.093623261898756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.196428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 514.5357666015625, "completions/mean_terminated_length": 139.6888885498047, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5747458934783936, "epoch": 0.02888408194426772, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 1.069998025894165, "kl": 1.082310751080513, "learning_rate": 9.712431607074693e-06, "loss": 0.2122, "num_tokens": 34956905.0, "reward": 0.6464285850524902, "reward_std": 0.31882405281066895, "rewards/unified_reward/mean": 0.6464285850524902, "rewards/unified_reward/std": 0.31882408261299133, "sampling/importance_sampling_ratio/max": 2.909119129180908, "sampling/importance_sampling_ratio/mean": 0.8860241174697876, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7938251495361328, "sampling/sampling_logp_difference/mean": 0.01683906838297844, "step": 227, "step_time": 27.082452940754592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 306.45538330078125, "completions/mean_terminated_length": 116.78217315673828, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.3451770544052124, "epoch": 0.02901132459600458, "frac_reward_zero_std": 0.6428571939468384, "grad_norm": 0.4549373984336853, "kl": 0.8204703629016876, "learning_rate": 9.711159180557323e-06, "loss": 0.022, "num_tokens": 35020772.0, "reward": 0.7053571939468384, "reward_std": 0.4186781048774719, "rewards/unified_reward/mean": 0.7053571343421936, "rewards/unified_reward/std": 0.4186781048774719, "sampling/importance_sampling_ratio/max": 2.74229097366333, "sampling/importance_sampling_ratio/mean": 0.8869180679321289, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6055099964141846, "sampling/sampling_logp_difference/mean": 0.022500498220324516, "step": 228, "step_time": 25.691305039916188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 582.8482666015625, "completions/mean_terminated_length": 161.8275909423828, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.643970251083374, "epoch": 0.029138567247741443, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.295050710439682, "kl": 0.5408705100417137, "learning_rate": 9.709886754039956e-06, "loss": -0.0525, "num_tokens": 35110323.0, "reward": 0.42470237612724304, "reward_std": 0.39833226799964905, "rewards/unified_reward/mean": 0.42470237612724304, "rewards/unified_reward/std": 0.39833223819732666, "sampling/importance_sampling_ratio/max": 2.9846622943878174, "sampling/importance_sampling_ratio/mean": 0.8312628865242004, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5699558258056641, "sampling/sampling_logp_difference/mean": 0.02095576748251915, "step": 229, "step_time": 28.37922864826396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 465.3839416503906, "completions/mean_terminated_length": 201.61459350585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.7170169353485107, "epoch": 0.029265809899478305, "frac_reward_zero_std": 0.5, "grad_norm": 0.5483115911483765, "kl": 0.3179236277937889, "learning_rate": 9.708614327522586e-06, "loss": -0.1744, "num_tokens": 35186870.0, "reward": 0.7648810148239136, "reward_std": 0.3305465877056122, "rewards/unified_reward/mean": 0.7648810148239136, "rewards/unified_reward/std": 0.3305465579032898, "sampling/importance_sampling_ratio/max": 2.7275683879852295, "sampling/importance_sampling_ratio/mean": 0.861115038394928, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.431354284286499, "sampling/sampling_logp_difference/mean": 0.017885245382785797, "step": 230, "step_time": 24.858076949836686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.392857164144516, "completions/max_length": 2048.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 933.0803833007812, "completions/mean_terminated_length": 211.66175842285156, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.18178790807724, "epoch": 0.029393052551215167, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.24361327290534973, "kl": 0.39489684998989105, "learning_rate": 9.707341901005217e-06, "loss": 0.0452, "num_tokens": 35337159.0, "reward": 0.5066964626312256, "reward_std": 0.30886515974998474, "rewards/unified_reward/mean": 0.5066964030265808, "rewards/unified_reward/std": 0.30886518955230713, "sampling/importance_sampling_ratio/max": 2.6840362548828125, "sampling/importance_sampling_ratio/mean": 0.7969599962234497, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.43159055709838867, "sampling/sampling_logp_difference/mean": 0.02065310999751091, "step": 231, "step_time": 46.376890543149784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1187.0, "completions/mean_length": 746.794677734375, "completions/mean_terminated_length": 155.33766174316406, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.6267953217029572, "epoch": 0.02952029520295203, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.29824069142341614, "kl": 0.3354276902973652, "learning_rate": 9.706069474487849e-06, "loss": 0.0093, "num_tokens": 35439416.0, "reward": 0.6119047999382019, "reward_std": 0.3844606280326843, "rewards/unified_reward/mean": 0.6119047403335571, "rewards/unified_reward/std": 0.38446059823036194, "sampling/importance_sampling_ratio/max": 2.1693179607391357, "sampling/importance_sampling_ratio/mean": 0.8501747846603394, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5766687393188477, "sampling/sampling_logp_difference/mean": 0.014958404004573822, "step": 232, "step_time": 29.084470732836053 }, { "clip_ratio/high_max": 7.229612674564123e-05, "clip_ratio/high_mean": 1.0328018106520176e-05, "clip_ratio/low_mean": 5.611042524833465e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.593906063135364e-05, "completions/clipped_ratio": 0.1428571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 449.9464416503906, "completions/mean_terminated_length": 183.6041717529297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5768753588199615, "epoch": 0.02964753785468889, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.3868400454521179, "kl": 0.5139264091849327, "learning_rate": 9.70479704797048e-06, "loss": -0.0209, "num_tokens": 35532602.0, "reward": 0.540178656578064, "reward_std": 0.3905448615550995, "rewards/unified_reward/mean": 0.5401785969734192, "rewards/unified_reward/std": 0.3905448913574219, "sampling/importance_sampling_ratio/max": 2.4723780155181885, "sampling/importance_sampling_ratio/mean": 0.8153262734413147, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.608701229095459, "sampling/sampling_logp_difference/mean": 0.02129625529050827, "step": 233, "step_time": 33.63647404918447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3035714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 840.9285888671875, "completions/mean_terminated_length": 314.76922607421875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.7320332527160645, "epoch": 0.029774780506425753, "frac_reward_zero_std": 0.0714285746216774, "grad_norm": 0.4240826964378357, "kl": 0.25550612062215805, "learning_rate": 9.703524621453112e-06, "loss": 0.1174, "num_tokens": 35659138.0, "reward": 0.6212798357009888, "reward_std": 0.3408331274986267, "rewards/unified_reward/mean": 0.621279776096344, "rewards/unified_reward/std": 0.3408331274986267, "sampling/importance_sampling_ratio/max": 2.4219560623168945, "sampling/importance_sampling_ratio/mean": 0.7752193212509155, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.488433837890625, "sampling/sampling_logp_difference/mean": 0.022041138261556625, "step": 234, "step_time": 33.33434915728867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 538.7767944335938, "completions/mean_terminated_length": 105.09195709228516, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.9275766611099243, "epoch": 0.029902023158162615, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.4891400635242462, "kl": 0.4097444415092468, "learning_rate": 9.702252194935743e-06, "loss": 0.1626, "num_tokens": 35753081.0, "reward": 0.5267857909202576, "reward_std": 0.39162498712539673, "rewards/unified_reward/mean": 0.5267857313156128, "rewards/unified_reward/std": 0.39162498712539673, "sampling/importance_sampling_ratio/max": 2.037155866622925, "sampling/importance_sampling_ratio/mean": 0.8169717788696289, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.44744443893432617, "sampling/sampling_logp_difference/mean": 0.0263386033475399, "step": 235, "step_time": 29.94002115004696 }, { "clip_ratio/high_max": 0.0002550014978623949, "clip_ratio/high_mean": 3.642878527898574e-05, "clip_ratio/low_mean": 5.659053385898005e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 4.208784139336785e-05, "completions/clipped_ratio": 0.2946428656578064, "completions/max_length": 2048.0, "completions/max_terminated_length": 1951.0, "completions/mean_length": 826.9017944335938, "completions/mean_terminated_length": 316.8227844238281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.918336570262909, "epoch": 0.030029265809899477, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.2991235554218292, "kl": 0.2981361076235771, "learning_rate": 9.700979768418374e-06, "loss": 0.0871, "num_tokens": 35874222.0, "reward": 0.45505955815315247, "reward_std": 0.3333321213722229, "rewards/unified_reward/mean": 0.4550595283508301, "rewards/unified_reward/std": 0.3333321213722229, "sampling/importance_sampling_ratio/max": 2.4596736431121826, "sampling/importance_sampling_ratio/mean": 0.6445527672767639, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5044355392456055, "sampling/sampling_logp_difference/mean": 0.023621955886483192, "step": 236, "step_time": 30.990062535973266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0714285746216774, "completions/max_length": 2048.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 287.4107360839844, "completions/mean_terminated_length": 151.98077392578125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.584237277507782, "epoch": 0.03015650846163634, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 0.7558721303939819, "kl": 0.7390472292900085, "learning_rate": 9.699707341901006e-06, "loss": 0.1296, "num_tokens": 35935988.0, "reward": 0.5336310267448425, "reward_std": 0.34928053617477417, "rewards/unified_reward/mean": 0.5336309671401978, "rewards/unified_reward/std": 0.34928056597709656, "sampling/importance_sampling_ratio/max": 2.1044178009033203, "sampling/importance_sampling_ratio/mean": 0.8221467137336731, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5238933563232422, "sampling/sampling_logp_difference/mean": 0.027515284717082977, "step": 237, "step_time": 22.749790735077113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.133928582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 428.8125305175781, "completions/mean_terminated_length": 178.42266845703125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 2.912869155406952, "epoch": 0.030283751113373204, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.598240852355957, "kl": 0.6749486923217773, "learning_rate": 9.698434915383637e-06, "loss": 0.0988, "num_tokens": 36023311.0, "reward": 0.5312500596046448, "reward_std": 0.3562197685241699, "rewards/unified_reward/mean": 0.5312500596046448, "rewards/unified_reward/std": 0.35621973872184753, "sampling/importance_sampling_ratio/max": 2.4971847534179688, "sampling/importance_sampling_ratio/mean": 0.8435335755348206, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6812305450439453, "sampling/sampling_logp_difference/mean": 0.025169646367430687, "step": 238, "step_time": 29.95234189927578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 723.2232666015625, "completions/mean_terminated_length": 302.4117736816406, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.1702064275741577, "epoch": 0.030410993765110066, "frac_reward_zero_std": 0.1428571492433548, "grad_norm": 0.41887596249580383, "kl": 0.2986025959253311, "learning_rate": 9.697162488866269e-06, "loss": -0.0056, "num_tokens": 36122592.0, "reward": 0.5389881134033203, "reward_std": 0.34118056297302246, "rewards/unified_reward/mean": 0.5389881134033203, "rewards/unified_reward/std": 0.3411805331707001, "sampling/importance_sampling_ratio/max": 2.696528911590576, "sampling/importance_sampling_ratio/mean": 0.7810502052307129, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.560575008392334, "sampling/sampling_logp_difference/mean": 0.023790409788489342, "step": 239, "step_time": 28.967305813916028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571827709675, "completions/max_length": 2048.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 140.86607360839844, "completions/mean_terminated_length": 123.68468475341797, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9623602032661438, "epoch": 0.03053823641684693, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 1.4977779388427734, "kl": 0.9640597254037857, "learning_rate": 9.6958900623489e-06, "loss": -0.0101, "num_tokens": 36172537.0, "reward": 0.626339316368103, "reward_std": 0.33999088406562805, "rewards/unified_reward/mean": 0.626339316368103, "rewards/unified_reward/std": 0.33999085426330566, "sampling/importance_sampling_ratio/max": 2.900010347366333, "sampling/importance_sampling_ratio/mean": 0.9371185898780823, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6883325576782227, "sampling/sampling_logp_difference/mean": 0.023509999737143517, "step": 240, "step_time": 18.42369587509893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 421.6607360839844, "completions/mean_terminated_length": 189.32652282714844, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 1.7158733010292053, "epoch": 0.03066547906858379, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.5326976776123047, "kl": 0.35143212229013443, "learning_rate": 9.694617635831531e-06, "loss": 0.0992, "num_tokens": 36242187.0, "reward": 0.5727678537368774, "reward_std": 0.4318697452545166, "rewards/unified_reward/mean": 0.5727677941322327, "rewards/unified_reward/std": 0.4318697452545166, "sampling/importance_sampling_ratio/max": 1.7960823774337769, "sampling/importance_sampling_ratio/mean": 0.8166316151618958, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5414609909057617, "sampling/sampling_logp_difference/mean": 0.016885999590158463, "step": 241, "step_time": 23.090080826776102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01785714365541935, "completions/max_length": 2048.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 93.08036041259766, "completions/mean_terminated_length": 57.53636169433594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.155844956636429, "epoch": 0.030792721720320652, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 1.9577434062957764, "kl": 1.434325784444809, "learning_rate": 9.693345209314163e-06, "loss": 0.1912, "num_tokens": 36274308.0, "reward": 0.5491071939468384, "reward_std": 0.3858018219470978, "rewards/unified_reward/mean": 0.5491071343421936, "rewards/unified_reward/std": 0.3858018219470978, "sampling/importance_sampling_ratio/max": 2.2974421977996826, "sampling/importance_sampling_ratio/mean": 0.9430510401725769, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6158227920532227, "sampling/sampling_logp_difference/mean": 0.028305668383836746, "step": 242, "step_time": 18.144755577202886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0446428582072258, "completions/max_length": 2048.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 260.40179443359375, "completions/mean_terminated_length": 176.86915588378906, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7547202110290527, "epoch": 0.030919964372057514, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.7852715253829956, "kl": 0.9419408366084099, "learning_rate": 9.692072782796794e-06, "loss": 0.275, "num_tokens": 36347777.0, "reward": 0.656994104385376, "reward_std": 0.4043596088886261, "rewards/unified_reward/mean": 0.6569940447807312, "rewards/unified_reward/std": 0.4043596088886261, "sampling/importance_sampling_ratio/max": 2.6987149715423584, "sampling/importance_sampling_ratio/mean": 0.9037569761276245, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6285943984985352, "sampling/sampling_logp_difference/mean": 0.023043936118483543, "step": 243, "step_time": 29.001339801121503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.098214291036129, "completions/max_length": 2048.0, "completions/max_terminated_length": 1665.0, "completions/mean_length": 350.5714416503906, "completions/mean_terminated_length": 165.70297241210938, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1092375218868256, "epoch": 0.031047207023794376, "frac_reward_zero_std": 0.6428571939468384, "grad_norm": 0.9054849147796631, "kl": 0.6273559182882309, "learning_rate": 9.690800356279426e-06, "loss": -0.3175, "num_tokens": 36414409.0, "reward": 0.627678632736206, "reward_std": 0.4124899208545685, "rewards/unified_reward/mean": 0.6276785731315613, "rewards/unified_reward/std": 0.4124898612499237, "sampling/importance_sampling_ratio/max": 2.4969682693481445, "sampling/importance_sampling_ratio/mean": 0.9957541823387146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5118980407714844, "sampling/sampling_logp_difference/mean": 0.021734677255153656, "step": 244, "step_time": 24.068179310066625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0803571492433548, "completions/max_length": 2048.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 222.5803680419922, "completions/mean_terminated_length": 63.07767105102539, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.0933182537555695, "epoch": 0.03117444967553124, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.7861321568489075, "kl": 0.8524122685194016, "learning_rate": 9.689527929762057e-06, "loss": 0.0075, "num_tokens": 36495682.0, "reward": 0.5505952835083008, "reward_std": 0.3722340762615204, "rewards/unified_reward/mean": 0.550595223903656, "rewards/unified_reward/std": 0.372234046459198, "sampling/importance_sampling_ratio/max": 2.3953537940979004, "sampling/importance_sampling_ratio/mean": 0.9295626878738403, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8971774578094482, "sampling/sampling_logp_difference/mean": 0.024969883263111115, "step": 245, "step_time": 34.52763933502138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1160714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 346.6250305175781, "completions/mean_terminated_length": 123.21212005615234, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "entropy": 2.505423665046692, "epoch": 0.0313016923272681, "frac_reward_zero_std": 0.2142857313156128, "grad_norm": 0.6553359031677246, "kl": 0.6488655656576157, "learning_rate": 9.688255503244688e-06, "loss": -0.1003, "num_tokens": 36578784.0, "reward": 0.5639881491661072, "reward_std": 0.3368771970272064, "rewards/unified_reward/mean": 0.5639881491661072, "rewards/unified_reward/std": 0.3368772268295288, "sampling/importance_sampling_ratio/max": 2.9535720348358154, "sampling/importance_sampling_ratio/mean": 0.8307265043258667, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8723225593566895, "sampling/sampling_logp_difference/mean": 0.026131782680749893, "step": 246, "step_time": 33.97451909980737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 200.48214721679688, "completions/mean_terminated_length": 77.31428527832031, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2606536149978638, "epoch": 0.03142893497900496, "frac_reward_zero_std": 0.2857142984867096, "grad_norm": 2.076927423477173, "kl": 0.5841810777783394, "learning_rate": 9.68698307672732e-06, "loss": 0.2617, "num_tokens": 36663614.0, "reward": 0.4308035969734192, "reward_std": 0.4297333061695099, "rewards/unified_reward/mean": 0.4308035671710968, "rewards/unified_reward/std": 0.4297333359718323, "sampling/importance_sampling_ratio/max": 2.878458261489868, "sampling/importance_sampling_ratio/mean": 0.9581747055053711, "sampling/importance_sampling_ratio/min": 0.00720991799607873, "sampling/sampling_logp_difference/max": 0.36945199966430664, "sampling/sampling_logp_difference/mean": 0.027164895087480545, "step": 247, "step_time": 33.98137942282483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1785714328289032, "completions/max_length": 2048.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 496.0089416503906, "completions/mean_terminated_length": 158.61956787109375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.169854372739792, "epoch": 0.031556177630741825, "frac_reward_zero_std": 0.3571428656578064, "grad_norm": 0.37681010365486145, "kl": 0.6012615263462067, "learning_rate": 9.685710650209951e-06, "loss": -0.0265, "num_tokens": 36746063.0, "reward": 0.6071428656578064, "reward_std": 0.3875613212585449, "rewards/unified_reward/mean": 0.6071428656578064, "rewards/unified_reward/std": 0.3875613212585449, "sampling/importance_sampling_ratio/max": 2.8741419315338135, "sampling/importance_sampling_ratio/mean": 0.7529699206352234, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8429465293884277, "sampling/sampling_logp_difference/mean": 0.02418946847319603, "step": 248, "step_time": 26.54506941791624 }, { "clip_ratio/high_max": 0.0015045180916786194, "clip_ratio/high_mean": 0.00021493115127668716, "clip_ratio/low_mean": 5.23823473486118e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022016938601154834, "completions/clipped_ratio": 0.2857142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 764.1607666015625, "completions/mean_terminated_length": 250.625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.4621182084083557, "epoch": 0.03168342028247869, "frac_reward_zero_std": 0.4285714626312256, "grad_norm": 0.4382263422012329, "kl": 0.2859679237008095, "learning_rate": 9.684438223692583e-06, "loss": 0.1058, "num_tokens": 36863385.0, "reward": 0.629464328289032, "reward_std": 0.2677434980869293, "rewards/unified_reward/mean": 0.6294642686843872, "rewards/unified_reward/std": 0.2677435278892517, "sampling/importance_sampling_ratio/max": 2.7362120151519775, "sampling/importance_sampling_ratio/mean": 0.7046492695808411, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7956228256225586, "sampling/sampling_logp_difference/mean": 0.025902118533849716, "step": 249, "step_time": 33.199072637129575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4107142984867096, "completions/max_length": 2048.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 976.6339721679688, "completions/mean_terminated_length": 229.92425537109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7013875246047974, "epoch": 0.03181066293421555, "frac_reward_zero_std": 0.5714285969734192, "grad_norm": 0.192351832985878, "kl": 0.3537377864122391, "learning_rate": 9.683165797175214e-06, "loss": -0.0879, "num_tokens": 37004832.0, "reward": 0.5648810267448425, "reward_std": 0.405068963766098, "rewards/unified_reward/mean": 0.5648809671401978, "rewards/unified_reward/std": 0.405068963766098, "sampling/importance_sampling_ratio/max": 2.8644843101501465, "sampling/importance_sampling_ratio/mean": 0.7705313563346863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4924275875091553, "sampling/sampling_logp_difference/mean": 0.027169521898031235, "step": 250, "step_time": 34.47676112689078 } ], "logging_steps": 1, "max_steps": 7859, "num_input_tokens_seen": 37004832, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }