[ { "step": 1, "loss": -0.0, "grad_norm": 0.0, "learning_rate": 0.0, "num_tokens": 6155.0, "completions/mean_length": 18.5, "completions/min_length": 13.0, "completions/max_length": 24.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 18.5, "completions/min_terminated_length": 13.0, "completions/max_terminated_length": 24.0, "rewards/reward_total/mean": 0.8737499713897705, "rewards/reward_total/std": 0.014071441255509853, "rewards/reward_market/mean": 0.6000000238418579, "rewards/reward_market/std": 0.0, "rewards/reward_warehouse/mean": 0.20000000298023224, "rewards/reward_warehouse/std": 0.0, "rewards/reward_showroom/mean": 0.07375000417232513, "rewards/reward_showroom/std": 0.01407142635434866, "reward": 0.8737499713897705, "reward_std": 0.014071442186832428, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 4.520341396331787, "sampling/sampling_logp_difference/max": 27.174238204956055, "sampling/importance_sampling_ratio/min": 4.035991810979052e-40, "sampling/importance_sampling_ratio/mean": 3.371377950408304e-34, "sampling/importance_sampling_ratio/max": 6.742751768219281e-34, "entropy": 0.1811772882938385, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "step_time": 4.618328085001849, "epoch": 0.0033333333333333335 }, { "step": 2, "loss": 0.0, "grad_norm": 0.0, "learning_rate": 5.000000000000001e-07, "num_tokens": 13990.0, "completions/mean_length": 20.5, "completions/min_length": 13.0, "completions/max_length": 28.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 20.5, "completions/min_terminated_length": 13.0, "completions/max_terminated_length": 28.0, "rewards/reward_total/mean": 0.9020000100135803, "rewards/reward_total/std": 0.04058792069554329, "rewards/reward_market/mean": 0.20000000298023224, "rewards/reward_market/std": 0.0, "rewards/reward_warehouse/mean": 0.6000000238418579, "rewards/reward_warehouse/std": 0.0, "rewards/reward_showroom/mean": 0.10199999809265137, "rewards/reward_showroom/std": 0.04058793559670448, "reward": 0.9020000100135803, "reward_std": 0.04058792069554329, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 4.2938385009765625, "sampling/sampling_logp_difference/max": 26.646509170532227, "sampling/importance_sampling_ratio/min": 2.0038568039844884e-43, "sampling/importance_sampling_ratio/mean": 9.08055335179305e-34, "sampling/importance_sampling_ratio/max": 1.81611067035861e-33, "entropy": 0.11802829056978226, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "step_time": 6.2618235270019795, "epoch": 0.006666666666666667 }, { "step": 3, "loss": 0.0, "grad_norm": 0.0, "learning_rate": 1.0000000000000002e-06, "num_tokens": 27189.0, "completions/mean_length": 47.5, "completions/min_length": 40.0, "completions/max_length": 55.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 47.5, "completions/min_terminated_length": 40.0, "completions/max_terminated_length": 55.0, "rewards/reward_total/mean": 0.800000011920929, "rewards/reward_total/std": 0.0, "rewards/reward_market/mean": 0.20000000298023224, "rewards/reward_market/std": 0.0, "rewards/reward_warehouse/mean": 0.6000000238418579, "rewards/reward_warehouse/std": 0.0, "rewards/reward_showroom/mean": 0.0, "rewards/reward_showroom/std": 0.0, "reward": 0.800000011920929, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "sampling/sampling_logp_difference/mean": 2.252011299133301, "sampling/sampling_logp_difference/max": 28.722728729248047, "sampling/importance_sampling_ratio/min": 0.0, "sampling/importance_sampling_ratio/mean": 5.605193857299268e-45, "sampling/importance_sampling_ratio/max": 1.2611686178923354e-44, "entropy": 0.12017613649368286, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "step_time": 10.559301583001798, "epoch": 0.01 }, { "step": 4, "loss": 0.0, "grad_norm": 0.0, "learning_rate": 1.5e-06, "num_tokens": 32537.0, "completions/mean_length": 16.0, "completions/min_length": 16.0, "completions/max_length": 16.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 16.0, "completions/min_terminated_length": 16.0, "completions/max_terminated_length": 16.0, "rewards/reward_total/mean": 0.8726999759674072, "rewards/reward_total/std": 0.008343853987753391, "rewards/reward_market/mean": 0.20000000298023224, "rewards/reward_market/std": 0.0, "rewards/reward_warehouse/mean": 0.6000000238418579, "rewards/reward_warehouse/std": 0.0, "rewards/reward_showroom/mean": 0.07269999384880066, "rewards/reward_showroom/std": 0.008343859575688839, "reward": 0.8726999759674072, "reward_std": 0.008343853987753391, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 6.424993515014648, "sampling/sampling_logp_difference/max": 30.274734497070312, "sampling/importance_sampling_ratio/min": 1.401298464324817e-45, "sampling/importance_sampling_ratio/mean": 2.802596928649634e-45, "sampling/importance_sampling_ratio/max": 2.802596928649634e-45, "entropy": 0.17401638627052307, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "step_time": 1.9552808339976764, "epoch": 0.013333333333333334 }, { "step": 5, "loss": -0.0, "grad_norm": 0.0, "learning_rate": 2.0000000000000003e-06, "num_tokens": 41360.0, "completions/mean_length": 32.0, "completions/min_length": 12.0, "completions/max_length": 52.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 32.0, "completions/min_terminated_length": 12.0, "completions/max_terminated_length": 52.0, "rewards/reward_total/mean": 0.8379999995231628, "rewards/reward_total/std": 0.05374009534716606, "rewards/reward_market/mean": 0.6000000238418579, "rewards/reward_market/std": 0.0, "rewards/reward_warehouse/mean": 0.20000000298023224, "rewards/reward_warehouse/std": 0.0, "rewards/reward_showroom/mean": 0.03799999877810478, "rewards/reward_showroom/std": 0.053740113973617554, "reward": 0.8379999995231628, "reward_std": 0.05374009534716606, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 3.4955575466156006, "sampling/sampling_logp_difference/max": 31.43513298034668, "sampling/importance_sampling_ratio/min": 0.0, "sampling/importance_sampling_ratio/mean": 2.3879863577791495e-32, "sampling/importance_sampling_ratio/max": 4.775972715558299e-32, "entropy": 0.10594719648361206, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "step_time": 8.467385248000937, "epoch": 0.016666666666666666 }, { "step": 6, "loss": 0.0, "grad_norm": 0.0, "learning_rate": 2.5e-06, "num_tokens": 53669.0, "completions/mean_length": 47.0, "completions/min_length": 34.0, "completions/max_length": 60.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 47.0, "completions/min_terminated_length": 34.0, "completions/max_terminated_length": 60.0, "rewards/reward_total/mean": 0.800000011920929, "rewards/reward_total/std": 0.0, "rewards/reward_market/mean": 0.20000000298023224, "rewards/reward_market/std": 0.0, "rewards/reward_warehouse/mean": 0.6000000238418579, "rewards/reward_warehouse/std": 0.0, "rewards/reward_showroom/mean": 0.0, "rewards/reward_showroom/std": 0.0, "reward": 0.800000011920929, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "sampling/sampling_logp_difference/mean": 2.4156625270843506, "sampling/sampling_logp_difference/max": 28.952373504638672, "sampling/importance_sampling_ratio/min": 0.0, "sampling/importance_sampling_ratio/mean": 1.090479293292958e-33, "sampling/importance_sampling_ratio/max": 2.180958586585916e-33, "entropy": 0.1315789371728897, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "step_time": 8.83436526999867, "epoch": 0.02 }, { "step": 7, "loss": -0.0, "grad_norm": 0.0, "learning_rate": 3e-06, "num_tokens": 62446.0, "completions/mean_length": 26.0, "completions/min_length": 13.0, "completions/max_length": 39.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 26.0, "completions/min_terminated_length": 13.0, "completions/max_terminated_length": 39.0, "rewards/reward_total/mean": 0.8481500148773193, "rewards/reward_total/std": 0.024678032845258713, "rewards/reward_market/mean": 0.20000000298023224, "rewards/reward_market/std": 0.0, "rewards/reward_warehouse/mean": 0.6000000238418579, "rewards/reward_warehouse/std": 0.0, "rewards/reward_showroom/mean": 0.04814999923110008, "rewards/reward_showroom/std": 0.024678027257323265, "reward": 0.8481500148773193, "reward_std": 0.024678032845258713, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 3.9407880306243896, "sampling/sampling_logp_difference/max": 28.3719482421875, "sampling/importance_sampling_ratio/min": 0.0, "sampling/importance_sampling_ratio/mean": 6.8739474988531065e-34, "sampling/importance_sampling_ratio/max": 1.3747894997706213e-33, "entropy": 0.10619711875915527, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "step_time": 8.326133659000334, "epoch": 0.023333333333333334 }, { "step": 8, "loss": 0.0, "grad_norm": 0.0, "learning_rate": 3.5e-06, "num_tokens": 72064.0, "completions/mean_length": 29.5, "completions/min_length": 19.0, "completions/max_length": 40.0, "completions/clipped_ratio": 0.0, "completions/mean_terminated_length": 29.5, "completions/min_terminated_length": 19.0, "completions/max_terminated_length": 40.0, "rewards/reward_total/mean": 0.8438500165939331, "rewards/reward_total/std": 0.06201327219605446, "rewards/reward_market/mean": 0.20000000298023224, "rewards/reward_market/std": 0.0, "rewards/reward_warehouse/mean": 0.6000000238418579, "rewards/reward_warehouse/std": 0.0, "rewards/reward_showroom/mean": 0.04385000094771385, "rewards/reward_showroom/std": 0.06201326474547386, "reward": 0.8438500165939331, "reward_std": 0.06201327219605446, "frac_reward_zero_std": 0.0, "sampling/sampling_logp_difference/mean": 3.4995782375335693, "sampling/sampling_logp_difference/max": 30.46342658996582, "sampling/importance_sampling_ratio/min": 7.006492321624085e-45, "sampling/importance_sampling_ratio/mean": 8.407790785948902e-45, "sampling/importance_sampling_ratio/max": 9.80908925027372e-45, "entropy": 0.11579056829214096, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "step_time": 8.365506213998742, "epoch": 0.02666666666666667 }, { "step": 8, "train_runtime": 91.2182, "train_samples_per_second": 0.175, "train_steps_per_second": 0.088, "total_flos": 0.0, "train_loss": -7.903189287664419e-34, "epoch": 0.02666666666666667 } ]