{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.7160001886400076e-05, "eval_steps": 500, "global_step": 1179, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.147373962202437e-06, "epoch": 4.000000160000006e-08, "frac_reward_zero_std": 0.0, "grad_norm": 2.2506166375002579e-10, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 12965.0, "reward": 8.581249237060547, "reward_std": 12.602022171020508, "rewards/rollout_reward_func/mean": 8.581249237060547, "rewards/rollout_reward_func/std": 12.602022171020508, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.940695295223122e-08, "step": 1, "step_time": 10.139658490996226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.224938413064592e-06, "epoch": 8.000000320000012e-08, "frac_reward_zero_std": 0.0, "grad_norm": 2.7702032334708804e-10, "kl": 0.0, "learning_rate": 2.1322799999999998e-07, "loss": -0.0, "num_tokens": 25885.0, "reward": 5.743749618530273, "reward_std": 7.329298973083496, "rewards/rollout_reward_func/mean": 5.743749618530273, "rewards/rollout_reward_func/std": 7.329298973083496, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 2, "step_time": 8.208553140000731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2924239146959735e-06, "epoch": 1.200000048000002e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.7765065246931897e-10, "kl": 0.0, "learning_rate": 4.2645599999999996e-07, "loss": 0.0, "num_tokens": 38842.0, "reward": 7.5625, "reward_std": 8.587965965270996, "rewards/rollout_reward_func/mean": 7.5625, "rewards/rollout_reward_func/std": 8.587966918945312, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0430811414607888e-07, "step": 3, "step_time": 8.10875642701285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.261355689370248e-06, "epoch": 1.6000000640000024e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.951114630445062e-10, "kl": 0.0, "learning_rate": 6.39684e-07, "loss": 0.0, "num_tokens": 51794.0, "reward": 4.137499809265137, "reward_std": 4.753366947174072, "rewards/rollout_reward_func/mean": 4.137499809265137, "rewards/rollout_reward_func/std": 4.753366947174072, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 4, "step_time": 8.141264145997411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2046284442467368e-06, "epoch": 2.000000080000003e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.330223514146468e-10, "kl": 0.0, "learning_rate": 8.529119999999999e-07, "loss": -0.0, "num_tokens": 64767.0, "reward": 4.143750190734863, "reward_std": 5.902538299560547, "rewards/rollout_reward_func/mean": 4.143750190734863, "rewards/rollout_reward_func/std": 5.902538776397705, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 5, "step_time": 8.142842120985733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1769494225054586e-06, "epoch": 2.400000096000004e-07, "frac_reward_zero_std": 0.0, "grad_norm": 3.380425661614339e-10, "kl": 0.0, "learning_rate": 1.0661399999999999e-06, "loss": 0.0, "num_tokens": 77773.0, "reward": 4.143750190734863, "reward_std": 5.840658664703369, "rewards/rollout_reward_func/mean": 4.143750190734863, "rewards/rollout_reward_func/std": 5.840658187866211, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 6, "step_time": 8.05084348400851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1707839437112852e-06, "epoch": 2.8000001120000047e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.2923335452063043e-10, "kl": 0.0, "learning_rate": 1.279368e-06, "loss": 0.0, "num_tokens": 90721.0, "reward": 5.106249809265137, "reward_std": 7.8632025718688965, "rewards/rollout_reward_func/mean": 5.106249809265137, "rewards/rollout_reward_func/std": 7.863203048706055, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 7, "step_time": 7.99214888500137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.224168355269285e-06, "epoch": 3.200000128000005e-07, "frac_reward_zero_std": 0.0, "grad_norm": 3.217917043940588e-10, "kl": 0.0, "learning_rate": 1.492596e-06, "loss": -0.0, "num_tokens": 103693.0, "reward": 2.875, "reward_std": 2.2561028003692627, "rewards/rollout_reward_func/mean": 2.875, "rewards/rollout_reward_func/std": 2.256103038787842, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0430811414607888e-07, "step": 8, "step_time": 8.0552555470058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.191095944681365e-06, "epoch": 3.6000001440000055e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.61257238065582e-10, "kl": 0.0, "learning_rate": 1.7058239999999999e-06, "loss": 0.0, "num_tokens": 116683.0, "reward": 6.5, "reward_std": 8.90168571472168, "rewards/rollout_reward_func/mean": 6.5, "rewards/rollout_reward_func/std": 8.90168571472168, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443990395790024e-08, "step": 9, "step_time": 12.668257255994831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.304020028986997e-06, "epoch": 4.000000160000006e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.5434268580148967e-10, "kl": 0.0, "learning_rate": 1.919052e-06, "loss": 0.0, "num_tokens": 129591.0, "reward": 2.6750001907348633, "reward_std": 2.42885422706604, "rewards/rollout_reward_func/mean": 2.6750001907348633, "rewards/rollout_reward_func/std": 2.428854465484619, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 10, "step_time": 7.785435590012639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.341870867894613e-06, "epoch": 4.400000176000007e-07, "frac_reward_zero_std": 0.0, "grad_norm": 3.1585581372617355e-10, "kl": 0.0, "learning_rate": 2.1322799999999998e-06, "loss": 0.0, "num_tokens": 142527.0, "reward": 5.631250381469727, "reward_std": 8.285868644714355, "rewards/rollout_reward_func/mean": 5.631250381469727, "rewards/rollout_reward_func/std": 8.285868644714355, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.1175869474300271e-07, "step": 11, "step_time": 7.776518418984779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1238449789962033e-06, "epoch": 4.800000192000008e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.642757401805085e-10, "kl": 0.0, "learning_rate": 2.345508e-06, "loss": 0.0, "num_tokens": 155523.0, "reward": 3.6625001430511475, "reward_std": 5.915727615356445, "rewards/rollout_reward_func/mean": 3.6625001430511475, "rewards/rollout_reward_func/std": 5.915727615356445, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 12, "step_time": 7.799313233001158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.27558351184598e-06, "epoch": 5.200000208000009e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.5059265773563766e-10, "kl": 0.0, "learning_rate": 2.558736e-06, "loss": 0.0, "num_tokens": 168473.0, "reward": 5.71875, "reward_std": 7.320401668548584, "rewards/rollout_reward_func/mean": 5.71875, "rewards/rollout_reward_func/std": 7.3204026222229, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 13, "step_time": 8.00489694299904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2911972337169573e-06, "epoch": 5.600000224000009e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.365949935967393e-10, "kl": 0.0, "learning_rate": 2.771964e-06, "loss": -0.0, "num_tokens": 181387.0, "reward": 3.46875, "reward_std": 6.075274467468262, "rewards/rollout_reward_func/mean": 3.46875, "rewards/rollout_reward_func/std": 6.075274467468262, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.940695295223122e-08, "step": 14, "step_time": 8.072770524995576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.205810773148187e-06, "epoch": 6.00000024000001e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.913017882466562e-10, "kl": 0.0, "learning_rate": 2.985192e-06, "loss": 0.0, "num_tokens": 194372.0, "reward": 6.024999618530273, "reward_std": 11.15577507019043, "rewards/rollout_reward_func/mean": 6.024999618530273, "rewards/rollout_reward_func/std": 11.15577507019043, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 15, "step_time": 8.13582070999837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.171209587231715e-06, "epoch": 6.40000025600001e-07, "frac_reward_zero_std": 0.0, "grad_norm": 3.1727084848220954e-10, "kl": 0.0, "learning_rate": 3.19842e-06, "loss": -0.0, "num_tokens": 207324.0, "reward": 3.9250001907348633, "reward_std": 5.3733296394348145, "rewards/rollout_reward_func/mean": 3.9250001907348633, "rewards/rollout_reward_func/std": 5.373328685760498, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 16, "step_time": 8.143076849999488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2080026553794596e-06, "epoch": 6.80000027200001e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.506559404480413e-10, "kl": 0.0, "learning_rate": 3.4116479999999997e-06, "loss": 0.0, "num_tokens": 220284.0, "reward": 2.9000000953674316, "reward_std": 2.038954019546509, "rewards/rollout_reward_func/mean": 2.9000000953674316, "rewards/rollout_reward_func/std": 2.038954019546509, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 17, "step_time": 8.129682488004619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2901477620962396e-06, "epoch": 7.200000288000011e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.8409416485963845e-10, "kl": 0.0, "learning_rate": 3.624876e-06, "loss": 0.0, "num_tokens": 233186.0, "reward": 4.981249809265137, "reward_std": 7.910012245178223, "rewards/rollout_reward_func/mean": 4.981249809265137, "rewards/rollout_reward_func/std": 7.910012722015381, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.1175869474300271e-07, "step": 18, "step_time": 7.897965942000155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.243341356233941e-06, "epoch": 7.600000304000012e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.9443031346332305e-10, "kl": 0.0, "learning_rate": 3.838104e-06, "loss": 0.0, "num_tokens": 246169.0, "reward": 4.075000286102295, "reward_std": 5.637907981872559, "rewards/rollout_reward_func/mean": 4.075000286102295, "rewards/rollout_reward_func/std": 5.6379075050354, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.692342134963837e-08, "step": 19, "step_time": 7.8119173040031455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.204108341175015e-06, "epoch": 8.000000320000012e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.559313316830014e-10, "kl": 0.0, "learning_rate": 4.051332e-06, "loss": 0.0, "num_tokens": 259113.0, "reward": 4.537499904632568, "reward_std": 6.030906677246094, "rewards/rollout_reward_func/mean": 4.537499904632568, "rewards/rollout_reward_func/std": 6.030906677246094, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 20, "step_time": 8.049839682986203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.39338473306816e-06, "epoch": 8.400000336000013e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.8371055504905485e-10, "kl": 0.0, "learning_rate": 4.2645599999999995e-06, "loss": -0.0, "num_tokens": 272067.0, "reward": 4.356249809265137, "reward_std": 6.871677875518799, "rewards/rollout_reward_func/mean": 4.356249809265137, "rewards/rollout_reward_func/std": 6.871678829193115, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 21, "step_time": 8.071140481995826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2686437262109393e-06, "epoch": 8.800000352000014e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.1737614486205814e-10, "kl": 0.0, "learning_rate": 4.477787999999999e-06, "loss": -0.0, "num_tokens": 285001.0, "reward": 3.4750001430511475, "reward_std": 5.766512393951416, "rewards/rollout_reward_func/mean": 3.4750001430511475, "rewards/rollout_reward_func/std": 5.766512870788574, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0430811414607888e-07, "step": 22, "step_time": 8.060187639988726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2450321637279558e-06, "epoch": 9.200000368000014e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.3760182710219624e-10, "kl": 0.0, "learning_rate": 4.691016e-06, "loss": 0.0, "num_tokens": 297964.0, "reward": 3.53125, "reward_std": 4.667863845825195, "rewards/rollout_reward_func/mean": 3.53125, "rewards/rollout_reward_func/std": 4.6678643226623535, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 23, "step_time": 8.10125090200745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2681095970256138e-06, "epoch": 9.600000384000016e-07, "frac_reward_zero_std": 0.0, "grad_norm": 2.4426985434367054e-10, "kl": 0.0, "learning_rate": 4.904244e-06, "loss": 0.0, "num_tokens": 310894.0, "reward": 3.6750001907348633, "reward_std": 2.873905658721924, "rewards/rollout_reward_func/mean": 3.6750001907348633, "rewards/rollout_reward_func/std": 2.873905658721924, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 24, "step_time": 8.095876025996404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.357509714556727e-06, "epoch": 1.0000000400000016e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.671836918377579e-10, "kl": 0.0, "learning_rate": 5.117472e-06, "loss": -0.0, "num_tokens": 323861.0, "reward": 6.412499904632568, "reward_std": 5.991535663604736, "rewards/rollout_reward_func/mean": 6.412499904632568, "rewards/rollout_reward_func/std": 5.991535663604736, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 25, "step_time": 7.932311734002724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2321511607970024e-06, "epoch": 1.0400000416000018e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.3979777325221505e-10, "kl": 0.0, "learning_rate": 5.3307e-06, "loss": -0.0, "num_tokens": 336809.0, "reward": 5.775000095367432, "reward_std": 10.168152809143066, "rewards/rollout_reward_func/mean": 5.775000095367432, "rewards/rollout_reward_func/std": 10.168152809143066, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 26, "step_time": 8.11258359700878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2833924617771117e-06, "epoch": 1.0800000432000017e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.2138550154492407e-10, "kl": 0.0, "learning_rate": 5.543928e-06, "loss": 0.0, "num_tokens": 349740.0, "reward": 2.0374999046325684, "reward_std": 1.335102915763855, "rewards/rollout_reward_func/mean": 2.0374999046325684, "rewards/rollout_reward_func/std": 1.3351030349731445, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 27, "step_time": 8.12253959998634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0808627567703297e-06, "epoch": 1.1200000448000019e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.0215196183670514e-10, "kl": 0.0, "learning_rate": 5.757156e-06, "loss": 0.0, "num_tokens": 362717.0, "reward": 5.21875, "reward_std": 7.227652072906494, "rewards/rollout_reward_func/mean": 5.21875, "rewards/rollout_reward_func/std": 7.227652549743652, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 28, "step_time": 8.057426735002082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3609362358456565e-06, "epoch": 1.1600000464000019e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.4580598667611753e-10, "kl": 0.0, "learning_rate": 5.970384e-06, "loss": -0.0, "num_tokens": 375671.0, "reward": 4.949999809265137, "reward_std": 6.466735363006592, "rewards/rollout_reward_func/mean": 4.949999809265137, "rewards/rollout_reward_func/std": 6.46673583984375, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 29, "step_time": 8.141302144991641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2601195439619914e-06, "epoch": 1.200000048000002e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.654537505948241e-10, "kl": 0.0, "learning_rate": 6.183612e-06, "loss": 0.0, "num_tokens": 388614.0, "reward": 5.949999809265137, "reward_std": 6.748036861419678, "rewards/rollout_reward_func/mean": 5.949999809265137, "rewards/rollout_reward_func/std": 6.748036861419678, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 30, "step_time": 8.224401664985635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3286784198717214e-06, "epoch": 1.240000049600002e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.343687466321853e-10, "kl": 0.0, "learning_rate": 6.39684e-06, "loss": 0.0, "num_tokens": 401516.0, "reward": 3.137500286102295, "reward_std": 3.825855255126953, "rewards/rollout_reward_func/mean": 3.137500286102295, "rewards/rollout_reward_func/std": 3.825855016708374, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0430811414607888e-07, "step": 31, "step_time": 8.13419719599915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2904786476374284e-06, "epoch": 1.280000051200002e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.757659378627153e-10, "kl": 0.0, "learning_rate": 6.610068e-06, "loss": 0.0, "num_tokens": 414455.0, "reward": 4.650000095367432, "reward_std": 5.678380489349365, "rewards/rollout_reward_func/mean": 4.650000095367432, "rewards/rollout_reward_func/std": 5.678380489349365, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 32, "step_time": 8.097923758999968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2819805565177376e-06, "epoch": 1.3200000528000021e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.6081442561221024e-10, "kl": 0.0, "learning_rate": 6.8232959999999994e-06, "loss": 0.0, "num_tokens": 427421.0, "reward": 5.856249809265137, "reward_std": 8.356391906738281, "rewards/rollout_reward_func/mean": 5.856249809265137, "rewards/rollout_reward_func/std": 8.356391906738281, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189048455482407e-08, "step": 33, "step_time": 8.233863238012418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3036556058286806e-06, "epoch": 1.360000054400002e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.093580114299499e-10, "kl": 0.0, "learning_rate": 7.036523999999999e-06, "loss": 0.0, "num_tokens": 440361.0, "reward": 7.543750762939453, "reward_std": 7.973453521728516, "rewards/rollout_reward_func/mean": 7.543750762939453, "rewards/rollout_reward_func/std": 7.973453044891357, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 34, "step_time": 8.06096702900686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4005407510685473e-06, "epoch": 1.4000000560000023e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.535270604564488e-10, "kl": 0.0, "learning_rate": 7.249752e-06, "loss": 0.0, "num_tokens": 453327.0, "reward": 4.243750095367432, "reward_std": 10.696850776672363, "rewards/rollout_reward_func/mean": 4.243750095367432, "rewards/rollout_reward_func/std": 10.696850776672363, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.142422192401682e-07, "step": 35, "step_time": 8.088374522005324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2730302049467355e-06, "epoch": 1.4400000576000022e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.910288399160521e-10, "kl": 0.0, "learning_rate": 7.46298e-06, "loss": 0.0, "num_tokens": 466303.0, "reward": 5.793750286102295, "reward_std": 8.269418716430664, "rewards/rollout_reward_func/mean": 5.793750286102295, "rewards/rollout_reward_func/std": 8.269418716430664, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0430811414607888e-07, "step": 36, "step_time": 8.1433752229932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.295816216246749e-06, "epoch": 1.4800000592000024e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.2777879582491778e-10, "kl": 0.0, "learning_rate": 7.462979999999994e-06, "loss": 0.0, "num_tokens": 479258.0, "reward": 7.412500381469727, "reward_std": 9.089912414550781, "rewards/rollout_reward_func/mean": 7.412500381469727, "rewards/rollout_reward_func/std": 9.089912414550781, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 37, "step_time": 8.143913138002972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.214254948285088e-06, "epoch": 1.5200000608000024e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.0327454997026706e-10, "kl": 0.0, "learning_rate": 7.462979999999978e-06, "loss": -0.0, "num_tokens": 492215.0, "reward": 8.09375, "reward_std": 8.804959297180176, "rewards/rollout_reward_func/mean": 8.09375, "rewards/rollout_reward_func/std": 8.80495834350586, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 38, "step_time": 8.181225027990877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2486074300331893e-06, "epoch": 1.5600000624000025e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.1632035879525233e-10, "kl": 0.0, "learning_rate": 7.46297999999995e-06, "loss": 0.0, "num_tokens": 505171.0, "reward": 8.03125, "reward_std": 12.37490463256836, "rewards/rollout_reward_func/mean": 8.03125, "rewards/rollout_reward_func/std": 12.374906539916992, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0927516314040986e-07, "step": 39, "step_time": 8.082870781006932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2166646544974355e-06, "epoch": 1.6000000640000025e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.016261962418554e-10, "kl": 0.0, "learning_rate": 7.462979999999912e-06, "loss": -0.0, "num_tokens": 518135.0, "reward": 2.9437499046325684, "reward_std": 2.2756593227386475, "rewards/rollout_reward_func/mean": 2.9437499046325684, "rewards/rollout_reward_func/std": 2.2756593227386475, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 40, "step_time": 8.07810879999306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2340426255595958e-06, "epoch": 1.6400000656000027e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.7515595357741063e-10, "kl": 0.0, "learning_rate": 7.462979999999863e-06, "loss": -0.0, "num_tokens": 531111.0, "reward": 5.018750190734863, "reward_std": 6.816569805145264, "rewards/rollout_reward_func/mean": 5.018750190734863, "rewards/rollout_reward_func/std": 6.816569805145264, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 41, "step_time": 8.077403585986758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.301548391869801e-06, "epoch": 1.6800000672000026e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.629192974445971e-10, "kl": 0.0, "learning_rate": 7.462979999999801e-06, "loss": 0.0, "num_tokens": 544047.0, "reward": 3.1437501907348633, "reward_std": 2.625253915786743, "rewards/rollout_reward_func/mean": 3.1437501907348633, "rewards/rollout_reward_func/std": 2.625253915786743, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0430811414607888e-07, "step": 42, "step_time": 8.215774511001655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2332118305712356e-06, "epoch": 1.7200000688000028e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.394714704312406e-10, "kl": 0.0, "learning_rate": 7.462979999999729e-06, "loss": -0.0, "num_tokens": 556985.0, "reward": 5.037499904632568, "reward_std": 6.506598949432373, "rewards/rollout_reward_func/mean": 5.037499904632568, "rewards/rollout_reward_func/std": 6.506599426269531, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 43, "step_time": 8.113155868006288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2070949796670902e-06, "epoch": 1.7600000704000028e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.0356719915580754e-10, "kl": 0.0, "learning_rate": 7.462979999999646e-06, "loss": -0.0, "num_tokens": 569943.0, "reward": 5.543749809265137, "reward_std": 8.8424711227417, "rewards/rollout_reward_func/mean": 5.543749809265137, "rewards/rollout_reward_func/std": 8.8424711227417, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.940696005765858e-08, "step": 44, "step_time": 8.18038847000571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2251969795661353e-06, "epoch": 1.800000072000003e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.276266397593929e-10, "kl": 0.0, "learning_rate": 7.462979999999553e-06, "loss": -0.0, "num_tokens": 582905.0, "reward": 2.6812500953674316, "reward_std": 2.0868537425994873, "rewards/rollout_reward_func/mean": 2.6812500953674316, "rewards/rollout_reward_func/std": 2.0868537425994873, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 45, "step_time": 8.172537058999296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.126461495777221e-06, "epoch": 1.840000073600003e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.9566135650860303e-10, "kl": 0.0, "learning_rate": 7.4629799999994476e-06, "loss": -0.0, "num_tokens": 595861.0, "reward": 4.143750190734863, "reward_std": 5.028580665588379, "rewards/rollout_reward_func/mean": 4.143750190734863, "rewards/rollout_reward_func/std": 5.028580665588379, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.692342134963837e-08, "step": 46, "step_time": 8.225590021000244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3756794007567805e-06, "epoch": 1.880000075200003e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.272059012626727e-10, "kl": 0.0, "learning_rate": 7.4629799999993315e-06, "loss": -0.0, "num_tokens": 608784.0, "reward": 2.799999952316284, "reward_std": 2.4105327129364014, "rewards/rollout_reward_func/mean": 2.799999952316284, "rewards/rollout_reward_func/std": 2.4105324745178223, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 47, "step_time": 8.032029877984314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2466352902483777e-06, "epoch": 1.9200000768000032e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.3628641538108184e-10, "kl": 0.0, "learning_rate": 7.4629799999992045e-06, "loss": 0.0, "num_tokens": 621710.0, "reward": 3.7562499046325684, "reward_std": 5.928571701049805, "rewards/rollout_reward_func/mean": 3.7562499046325684, "rewards/rollout_reward_func/std": 5.928571701049805, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 48, "step_time": 8.037210571004834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1795614486563863e-06, "epoch": 1.960000078400003e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.990690750603875e-10, "kl": 0.0, "learning_rate": 7.462979999999067e-06, "loss": 0.0, "num_tokens": 634646.0, "reward": 3.356250047683716, "reward_std": 6.5518412590026855, "rewards/rollout_reward_func/mean": 3.356250047683716, "rewards/rollout_reward_func/std": 6.551840782165527, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 49, "step_time": 8.049588057991059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3670434643463523e-06, "epoch": 2.000000080000003e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.2815652972750797e-10, "kl": 0.0, "learning_rate": 7.462979999998917e-06, "loss": 0.0, "num_tokens": 647598.0, "reward": 4.087500095367432, "reward_std": 7.438088417053223, "rewards/rollout_reward_func/mean": 4.087500095367432, "rewards/rollout_reward_func/std": 7.438089370727539, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443990395790024e-08, "step": 50, "step_time": 8.1427171250034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4162345084732806e-06, "epoch": 2.040000081600003e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.006020710127899e-10, "kl": 0.0, "learning_rate": 7.462979999998757e-06, "loss": -0.0, "num_tokens": 660528.0, "reward": 6.987499237060547, "reward_std": 12.202287673950195, "rewards/rollout_reward_func/mean": 6.987499237060547, "rewards/rollout_reward_func/std": 12.202288627624512, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 51, "step_time": 8.185513424999954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.204840939157293e-06, "epoch": 2.0800000832000035e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.5932675451478815e-10, "kl": 0.0, "learning_rate": 7.462979999998585e-06, "loss": -0.0, "num_tokens": 673464.0, "reward": 7.187499523162842, "reward_std": 10.283052444458008, "rewards/rollout_reward_func/mean": 7.187499523162842, "rewards/rollout_reward_func/std": 10.283052444458008, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.940695295223122e-08, "step": 52, "step_time": 8.153271165996557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.148284792724553e-06, "epoch": 2.1200000848000035e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.4655666397421783e-10, "kl": 0.0, "learning_rate": 7.462979999998403e-06, "loss": 0.0, "num_tokens": 686471.0, "reward": 4.137500286102295, "reward_std": 3.9717118740081787, "rewards/rollout_reward_func/mean": 4.137500286102295, "rewards/rollout_reward_func/std": 3.971712112426758, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 53, "step_time": 8.189161438000156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.173215193579381e-06, "epoch": 2.1600000864000034e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.1191396687724193e-10, "kl": 0.0, "learning_rate": 7.46297999999821e-06, "loss": 0.0, "num_tokens": 699455.0, "reward": 5.806249618530273, "reward_std": 7.273739814758301, "rewards/rollout_reward_func/mean": 5.806249618530273, "rewards/rollout_reward_func/std": 7.273739814758301, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 54, "step_time": 8.14643972198246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2477370009710285e-06, "epoch": 2.2000000880000034e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.344578697854871e-10, "kl": 0.0, "learning_rate": 7.462979999998006e-06, "loss": 0.0, "num_tokens": 712404.0, "reward": 7.668749809265137, "reward_std": 9.769251823425293, "rewards/rollout_reward_func/mean": 7.668749809265137, "rewards/rollout_reward_func/std": 9.769251823425293, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 55, "step_time": 8.151540220009338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3658322163555567e-06, "epoch": 2.2400000896000038e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.7344174147181377e-10, "kl": 0.0, "learning_rate": 7.46297999999779e-06, "loss": -0.0, "num_tokens": 725321.0, "reward": 3.2312498092651367, "reward_std": 2.02902889251709, "rewards/rollout_reward_func/mean": 3.2312498092651367, "rewards/rollout_reward_func/std": 2.02902889251709, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 56, "step_time": 8.126186539011542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.111861675757609e-06, "epoch": 2.2800000912000037e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.9687965974467545e-10, "kl": 0.0, "learning_rate": 7.462979999997564e-06, "loss": -0.0, "num_tokens": 738279.0, "reward": 5.21875, "reward_std": 10.887988090515137, "rewards/rollout_reward_func/mean": 5.21875, "rewards/rollout_reward_func/std": 10.887988090515137, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 7.947285496356926e-08, "step": 57, "step_time": 8.17952255000273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2709741926973948e-06, "epoch": 2.3200000928000037e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.577542346227091e-10, "kl": 0.0, "learning_rate": 7.462979999997327e-06, "loss": -0.0, "num_tokens": 751220.0, "reward": 5.612500190734863, "reward_std": 6.084392547607422, "rewards/rollout_reward_func/mean": 5.612500190734863, "rewards/rollout_reward_func/std": 6.084392547607422, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 58, "step_time": 8.05492930499895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2888500552653568e-06, "epoch": 2.3600000944000037e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.6923438478654305e-10, "kl": 0.0, "learning_rate": 7.462979999997078e-06, "loss": 0.0, "num_tokens": 764192.0, "reward": 1.8187499046325684, "reward_std": 2.734280824661255, "rewards/rollout_reward_func/mean": 1.8187499046325684, "rewards/rollout_reward_func/std": 2.734281063079834, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 59, "step_time": 8.078666151006473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.20845195997299e-06, "epoch": 2.400000096000004e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.890305772496049e-10, "kl": 0.0, "learning_rate": 7.462979999996818e-06, "loss": 0.0, "num_tokens": 777118.0, "reward": 6.837499618530273, "reward_std": 9.17393970489502, "rewards/rollout_reward_func/mean": 6.837499618530273, "rewards/rollout_reward_func/std": 9.17393970489502, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 60, "step_time": 8.032573650008999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2033101032548075e-06, "epoch": 2.440000097600004e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.1740088335396877e-10, "kl": 0.0, "learning_rate": 7.462979999996547e-06, "loss": 0.0, "num_tokens": 790095.0, "reward": 6.1875, "reward_std": 8.505047798156738, "rewards/rollout_reward_func/mean": 6.1875, "rewards/rollout_reward_func/std": 8.505047798156738, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189048455482407e-08, "step": 61, "step_time": 8.3111981910115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2774626984300994e-06, "epoch": 2.480000099200004e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.4472804338593335e-10, "kl": 0.0, "learning_rate": 7.462979999996264e-06, "loss": -0.0, "num_tokens": 803054.0, "reward": 4.412499904632568, "reward_std": 6.77789306640625, "rewards/rollout_reward_func/mean": 4.412499904632568, "rewards/rollout_reward_func/std": 6.777893543243408, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 62, "step_time": 8.210483206014032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2413926785702643e-06, "epoch": 2.520000100800004e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.385658615100539e-10, "kl": 0.0, "learning_rate": 7.462979999995973e-06, "loss": 0.0, "num_tokens": 815964.0, "reward": 5.050000190734863, "reward_std": 7.505998134613037, "rewards/rollout_reward_func/mean": 5.050000190734863, "rewards/rollout_reward_func/std": 7.505997657775879, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 63, "step_time": 8.090018924995093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2609683583141305e-06, "epoch": 2.560000102400004e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.909211760382391e-10, "kl": 0.0, "learning_rate": 7.462979999995668e-06, "loss": -0.0, "num_tokens": 828914.0, "reward": 3.7249999046325684, "reward_std": 2.3850924968719482, "rewards/rollout_reward_func/mean": 3.7249999046325684, "rewards/rollout_reward_func/std": 2.3850927352905273, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 64, "step_time": 7.920073139001033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.231308855016323e-06, "epoch": 2.6000001040000043e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.7630195353900433e-10, "kl": 0.0, "learning_rate": 7.462979999995354e-06, "loss": 0.0, "num_tokens": 841863.0, "reward": 6.362499713897705, "reward_std": 12.080224990844727, "rewards/rollout_reward_func/mean": 6.362499713897705, "rewards/rollout_reward_func/std": 12.080224990844727, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 65, "step_time": 7.8936763659949065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2190363608842745e-06, "epoch": 2.6400001056000042e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.567661916419439e-10, "kl": 0.0, "learning_rate": 7.462979999995028e-06, "loss": -0.0, "num_tokens": 854821.0, "reward": 5.012500286102295, "reward_std": 7.028027534484863, "rewards/rollout_reward_func/mean": 5.012500286102295, "rewards/rollout_reward_func/std": 7.028027534484863, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 66, "step_time": 7.907164546006243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.399017034804274e-06, "epoch": 2.680000107200004e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.1175875769839934e-10, "kl": 0.0, "learning_rate": 7.4629799999946915e-06, "loss": -0.0, "num_tokens": 867788.0, "reward": 5.199999809265137, "reward_std": 6.923005104064941, "rewards/rollout_reward_func/mean": 5.199999809265137, "rewards/rollout_reward_func/std": 6.9230055809021, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 67, "step_time": 7.889362171998073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1461813020096088e-06, "epoch": 2.720000108800004e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.7721017148429894e-10, "kl": 0.0, "learning_rate": 7.462979999994343e-06, "loss": 0.0, "num_tokens": 880742.0, "reward": 12.218750953674316, "reward_std": 10.548189163208008, "rewards/rollout_reward_func/mean": 12.218750953674316, "rewards/rollout_reward_func/std": 10.548190116882324, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 68, "step_time": 8.066539535997435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.261436463868449e-06, "epoch": 2.7600001104000046e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.1303332148624463e-10, "kl": 0.0, "learning_rate": 7.462979999993983e-06, "loss": 0.0, "num_tokens": 893716.0, "reward": 3.950000047683716, "reward_std": 4.7255330085754395, "rewards/rollout_reward_func/mean": 3.950000047683716, "rewards/rollout_reward_func/std": 4.725533485412598, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 69, "step_time": 8.080204022990074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1333754034458252e-06, "epoch": 2.8000001120000045e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.127615583713549e-10, "kl": 0.0, "learning_rate": 7.462979999993614e-06, "loss": -0.0, "num_tokens": 906680.0, "reward": 5.625000476837158, "reward_std": 6.938731670379639, "rewards/rollout_reward_func/mean": 5.625000476837158, "rewards/rollout_reward_func/std": 6.938732624053955, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 70, "step_time": 8.03530565800611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.256255754673475e-06, "epoch": 2.8400001136000045e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.325590692242585e-10, "kl": 0.0, "learning_rate": 7.462979999993232e-06, "loss": 0.0, "num_tokens": 919610.0, "reward": 5.237500190734863, "reward_std": 6.962554454803467, "rewards/rollout_reward_func/mean": 5.237500190734863, "rewards/rollout_reward_func/std": 6.962554931640625, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 71, "step_time": 8.176789520010061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.181721853844465e-06, "epoch": 2.8800001152000044e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.6216995241412633e-10, "kl": 0.0, "learning_rate": 7.46297999999284e-06, "loss": -0.0, "num_tokens": 932587.0, "reward": 5.068750381469727, "reward_std": 6.466294288635254, "rewards/rollout_reward_func/mean": 5.068750381469727, "rewards/rollout_reward_func/std": 6.466294288635254, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.940695295223122e-08, "step": 72, "step_time": 8.059565533003479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1565580681226493e-06, "epoch": 2.920000116800005e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.1096175634459655e-10, "kl": 0.0, "learning_rate": 7.462979999992437e-06, "loss": 0.0, "num_tokens": 945555.0, "reward": 1.9250000715255737, "reward_std": 1.7506190538406372, "rewards/rollout_reward_func/mean": 1.9250000715255737, "rewards/rollout_reward_func/std": 1.7506189346313477, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 73, "step_time": 7.974465946004784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.261713603957105e-06, "epoch": 2.9600001184000048e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.0554930807724645e-10, "kl": 0.0, "learning_rate": 7.4629799999920225e-06, "loss": 0.0, "num_tokens": 958504.0, "reward": 4.1875, "reward_std": 5.739439487457275, "rewards/rollout_reward_func/mean": 4.1875, "rewards/rollout_reward_func/std": 5.739439964294434, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.692342134963837e-08, "step": 74, "step_time": 8.14016615399305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1604718654089083e-06, "epoch": 3.0000001200000047e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.103557411066049e-10, "kl": 0.0, "learning_rate": 7.462979999991596e-06, "loss": 0.0, "num_tokens": 971496.0, "reward": 3.9187498092651367, "reward_std": 4.57124662399292, "rewards/rollout_reward_func/mean": 3.9187498092651367, "rewards/rollout_reward_func/std": 4.57124662399292, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.692342134963837e-08, "step": 75, "step_time": 7.937908487998357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.315255983376119e-06, "epoch": 3.0400001216000047e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.453346414910129e-10, "kl": 0.0, "learning_rate": 7.462979999991161e-06, "loss": 0.0, "num_tokens": 984430.0, "reward": 4.518750190734863, "reward_std": 5.910467147827148, "rewards/rollout_reward_func/mean": 4.518750190734863, "rewards/rollout_reward_func/std": 5.910467624664307, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0679163864324437e-07, "step": 76, "step_time": 8.124580253985187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.302706718637637e-06, "epoch": 3.080000123200005e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.435546209156314e-10, "kl": 0.0, "learning_rate": 7.462979999990713e-06, "loss": 0.0, "num_tokens": 997355.0, "reward": 4.587500095367432, "reward_std": 5.492161273956299, "rewards/rollout_reward_func/mean": 4.587500095367432, "rewards/rollout_reward_func/std": 5.492161273956299, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 77, "step_time": 7.868618612003047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.188067782071812e-06, "epoch": 3.120000124800005e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.889025130237144e-10, "kl": 0.0, "learning_rate": 7.462979999990255e-06, "loss": 0.0, "num_tokens": 1010279.0, "reward": 4.268750190734863, "reward_std": 5.7856974601745605, "rewards/rollout_reward_func/mean": 4.268750190734863, "rewards/rollout_reward_func/std": 5.7856974601745605, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 78, "step_time": 7.889343740003824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3347648152594047e-06, "epoch": 3.160000126400005e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.3746365984678164e-10, "kl": 0.0, "learning_rate": 7.4629799999897855e-06, "loss": 0.0, "num_tokens": 1023219.0, "reward": 4.625, "reward_std": 7.21530818939209, "rewards/rollout_reward_func/mean": 4.625, "rewards/rollout_reward_func/std": 7.21530818939209, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0430811414607888e-07, "step": 79, "step_time": 8.174822630011477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4136923570949875e-06, "epoch": 3.200000128000005e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.0964542041544973e-10, "kl": 0.0, "learning_rate": 7.462979999989305e-06, "loss": 0.0, "num_tokens": 1036177.0, "reward": 3.2437500953674316, "reward_std": 3.0197060108184814, "rewards/rollout_reward_func/mean": 3.2437500953674316, "rewards/rollout_reward_func/std": 3.0197064876556396, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 80, "step_time": 7.866260816001159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2161621870964154e-06, "epoch": 3.2400001296000054e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.518316111199681e-10, "kl": 0.0, "learning_rate": 7.462979999988813e-06, "loss": 0.0, "num_tokens": 1049097.0, "reward": 5.399999618530273, "reward_std": 8.018229484558105, "rewards/rollout_reward_func/mean": 5.399999618530273, "rewards/rollout_reward_func/std": 8.018229484558105, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 81, "step_time": 8.000104882004962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3030814872981864e-06, "epoch": 3.2800001312000053e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.8128463447352203e-10, "kl": 0.0, "learning_rate": 7.462979999988311e-06, "loss": 0.0, "num_tokens": 1062027.0, "reward": 3.1937501430511475, "reward_std": 1.9587304592132568, "rewards/rollout_reward_func/mean": 3.1937501430511475, "rewards/rollout_reward_func/std": 1.9587305784225464, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 82, "step_time": 7.840322085983644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1507327687686484e-06, "epoch": 3.3200001328000053e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.4643154183934257e-10, "kl": 0.0, "learning_rate": 7.462979999987797e-06, "loss": 0.0, "num_tokens": 1074999.0, "reward": 4.981249809265137, "reward_std": 6.774384498596191, "rewards/rollout_reward_func/mean": 4.981249809265137, "rewards/rollout_reward_func/std": 6.774385452270508, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 83, "step_time": 8.030197402003978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2753424389065913e-06, "epoch": 3.3600001344000052e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.222219908087908e-10, "kl": 0.0, "learning_rate": 7.4629799999872715e-06, "loss": -0.0, "num_tokens": 1087954.0, "reward": 3.143749952316284, "reward_std": 4.588676929473877, "rewards/rollout_reward_func/mean": 3.143749952316284, "rewards/rollout_reward_func/std": 4.588676929473877, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0927516314040986e-07, "step": 84, "step_time": 8.040010872995481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.366957801314129e-06, "epoch": 3.4000001360000056e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.0502292358569605e-10, "kl": 0.0, "learning_rate": 7.462979999986736e-06, "loss": 0.0, "num_tokens": 1100896.0, "reward": 9.600000381469727, "reward_std": 13.667285919189453, "rewards/rollout_reward_func/mean": 9.600000381469727, "rewards/rollout_reward_func/std": 13.667285919189453, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0430811414607888e-07, "step": 85, "step_time": 8.084725457003515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3620963247594773e-06, "epoch": 3.4400001376000056e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.5423754768105766e-10, "kl": 0.0, "learning_rate": 7.46297999998619e-06, "loss": -0.0, "num_tokens": 1113867.0, "reward": 5.431249618530273, "reward_std": 7.996058940887451, "rewards/rollout_reward_func/mean": 5.431249618530273, "rewards/rollout_reward_func/std": 7.996058940887451, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 86, "step_time": 8.080155628005741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.22918683334683e-06, "epoch": 3.4800001392000056e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.2422853291080855e-10, "kl": 0.0, "learning_rate": 7.462979999985631e-06, "loss": -0.0, "num_tokens": 1126853.0, "reward": 4.793749809265137, "reward_std": 6.586903095245361, "rewards/rollout_reward_func/mean": 4.793749809265137, "rewards/rollout_reward_func/std": 6.5869035720825195, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 87, "step_time": 12.148614273995918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2756235580345674e-06, "epoch": 3.5200001408000055e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.580160807230669e-10, "kl": 0.0, "learning_rate": 7.462979999985062e-06, "loss": -0.0, "num_tokens": 1139787.0, "reward": 4.862500190734863, "reward_std": 7.7284650802612305, "rewards/rollout_reward_func/mean": 4.862500190734863, "rewards/rollout_reward_func/std": 7.7284650802612305, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 88, "step_time": 8.108581022010185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2750947437089053e-06, "epoch": 3.560000142400006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.315489328053033e-10, "kl": 0.0, "learning_rate": 7.462979999984482e-06, "loss": 0.0, "num_tokens": 1152722.0, "reward": 4.699999809265137, "reward_std": 6.703233242034912, "rewards/rollout_reward_func/mean": 4.699999809265137, "rewards/rollout_reward_func/std": 6.703233242034912, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 89, "step_time": 8.096221604995662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.301213072541941e-06, "epoch": 3.600000144000006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.2276692429888953e-10, "kl": 0.0, "learning_rate": 7.462979999983891e-06, "loss": -0.0, "num_tokens": 1165675.0, "reward": 3.96875, "reward_std": 4.291187286376953, "rewards/rollout_reward_func/mean": 3.96875, "rewards/rollout_reward_func/std": 4.291187763214111, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 90, "step_time": 8.094183270986832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2586546890579484e-06, "epoch": 3.640000145600006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.415971589453392e-10, "kl": 0.0, "learning_rate": 7.462979999983289e-06, "loss": -0.0, "num_tokens": 1178645.0, "reward": 3.643749952316284, "reward_std": 6.126115798950195, "rewards/rollout_reward_func/mean": 3.643749952316284, "rewards/rollout_reward_func/std": 6.1261162757873535, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.940695295223122e-08, "step": 91, "step_time": 8.039611020023585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.330818176687899e-06, "epoch": 3.680000147200006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.619621186639165e-10, "kl": 0.0, "learning_rate": 7.462979999982676e-06, "loss": 0.0, "num_tokens": 1191625.0, "reward": 6.24375057220459, "reward_std": 9.606106758117676, "rewards/rollout_reward_func/mean": 6.24375057220459, "rewards/rollout_reward_func/std": 9.606107711791992, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 92, "step_time": 8.10697340300976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.201744678131945e-06, "epoch": 3.7200001488000058e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.889094241620427e-10, "kl": 0.0, "learning_rate": 7.462979999982051e-06, "loss": -0.0, "num_tokens": 1204575.0, "reward": 6.493749618530273, "reward_std": 8.0742769241333, "rewards/rollout_reward_func/mean": 6.493749618530273, "rewards/rollout_reward_func/std": 8.074277877807617, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 93, "step_time": 8.027821024006698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.379713521349913e-06, "epoch": 3.760000150400006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.577562052685778e-10, "kl": 0.0, "learning_rate": 7.462979999981416e-06, "loss": -0.0, "num_tokens": 1217508.0, "reward": 3.6062498092651367, "reward_std": 1.8964769840240479, "rewards/rollout_reward_func/mean": 3.6062498092651367, "rewards/rollout_reward_func/std": 1.8964769840240479, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 94, "step_time": 8.017703307006741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2402786896691396e-06, "epoch": 3.800000152000006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 5.425858651264548e-10, "kl": 0.0, "learning_rate": 7.462979999980769e-06, "loss": -0.0, "num_tokens": 1230448.0, "reward": 2.4250001907348633, "reward_std": 4.450842380523682, "rewards/rollout_reward_func/mean": 2.4250001907348633, "rewards/rollout_reward_func/std": 4.45084285736084, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.692343556049309e-08, "step": 95, "step_time": 8.103193704999285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3408398135416064e-06, "epoch": 3.8400001536000065e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.867907855641505e-10, "kl": 0.0, "learning_rate": 7.462979999980113e-06, "loss": 0.0, "num_tokens": 1243375.0, "reward": 4.349999904632568, "reward_std": 4.269425868988037, "rewards/rollout_reward_func/mean": 4.349999904632568, "rewards/rollout_reward_func/std": 4.269426345825195, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 96, "step_time": 8.084843101998558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.282585796820058e-06, "epoch": 3.8800001552000064e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.542280275186215e-10, "kl": 0.0, "learning_rate": 7.462979999979444e-06, "loss": 0.0, "num_tokens": 1256302.0, "reward": 4.9375, "reward_std": 9.227269172668457, "rewards/rollout_reward_func/mean": 4.9375, "rewards/rollout_reward_func/std": 9.227269172668457, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 97, "step_time": 8.091440905991476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2799605972068093e-06, "epoch": 3.920000156800006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.9958796554652167e-10, "kl": 0.0, "learning_rate": 7.462979999978765e-06, "loss": -0.0, "num_tokens": 1269257.0, "reward": 2.6687498092651367, "reward_std": 1.9625557661056519, "rewards/rollout_reward_func/mean": 2.6687498092651367, "rewards/rollout_reward_func/std": 1.9625557661056519, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 98, "step_time": 8.063235980982427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.257223087553939e-06, "epoch": 3.960000158400006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.3503332613472594e-10, "kl": 0.0, "learning_rate": 7.4629799999780735e-06, "loss": 0.0, "num_tokens": 1282241.0, "reward": 7.443750381469727, "reward_std": 8.367315292358398, "rewards/rollout_reward_func/mean": 7.443750381469727, "rewards/rollout_reward_func/std": 8.367315292358398, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 99, "step_time": 8.04980279901065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3053824236285436e-06, "epoch": 4.000000160000006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.4929516784233385e-10, "kl": 0.0, "learning_rate": 7.462979999977373e-06, "loss": 0.0, "num_tokens": 1295172.0, "reward": 2.6937499046325684, "reward_std": 3.117150068283081, "rewards/rollout_reward_func/mean": 2.6937499046325684, "rewards/rollout_reward_func/std": 3.11715030670166, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0679163864324437e-07, "step": 100, "step_time": 8.060301945995889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2856690122807777e-06, "epoch": 4.040000161600006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.9671001766651273e-10, "kl": 0.0, "learning_rate": 7.46297999997666e-06, "loss": 0.0, "num_tokens": 1308158.0, "reward": 3.5812501907348633, "reward_std": 2.418599843978882, "rewards/rollout_reward_func/mean": 3.5812501907348633, "rewards/rollout_reward_func/std": 2.418599843978882, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 101, "step_time": 8.039358419016935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.310565861307623e-06, "epoch": 4.080000163200006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.4645405161116685e-10, "kl": 0.0, "learning_rate": 7.462979999975936e-06, "loss": 0.0, "num_tokens": 1321088.0, "reward": 4.462500095367432, "reward_std": 6.804495811462402, "rewards/rollout_reward_func/mean": 4.462500095367432, "rewards/rollout_reward_func/std": 6.804496765136719, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 102, "step_time": 8.129766544006998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.186834194617404e-06, "epoch": 4.120000164800006e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.6589498098038575e-10, "kl": 0.0, "learning_rate": 7.462979999975201e-06, "loss": -0.0, "num_tokens": 1334026.0, "reward": 6.962499618530273, "reward_std": 6.878844261169434, "rewards/rollout_reward_func/mean": 6.962499618530273, "rewards/rollout_reward_func/std": 6.878844261169434, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 103, "step_time": 7.996518954008934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0881729199118126e-06, "epoch": 4.160000166400007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.623156691861084e-10, "kl": 0.0, "learning_rate": 7.462979999974455e-06, "loss": -0.0, "num_tokens": 1346966.0, "reward": 3.375, "reward_std": 5.159005165100098, "rewards/rollout_reward_func/mean": 3.375, "rewards/rollout_reward_func/std": 5.159005641937256, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 104, "step_time": 8.095499073999235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2200474916189705e-06, "epoch": 4.200000168000007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.3260777465571323e-10, "kl": 0.0, "learning_rate": 7.462979999973699e-06, "loss": 0.0, "num_tokens": 1359933.0, "reward": 3.5875000953674316, "reward_std": 4.384651184082031, "rewards/rollout_reward_func/mean": 3.5875000953674316, "rewards/rollout_reward_func/std": 4.3846516609191895, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 105, "step_time": 8.12141504300962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4155849303042487e-06, "epoch": 4.240000169600007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.3943900468180743e-10, "kl": 0.0, "learning_rate": 7.462979999972931e-06, "loss": 0.0, "num_tokens": 1372865.0, "reward": 5.675000190734863, "reward_std": 8.275384902954102, "rewards/rollout_reward_func/mean": 5.675000190734863, "rewards/rollout_reward_func/std": 8.275384902954102, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0679163864324437e-07, "step": 106, "step_time": 8.150542536983266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.319932676186909e-06, "epoch": 4.280000171200007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.871063942144758e-10, "kl": 0.0, "learning_rate": 7.462979999972152e-06, "loss": 0.0, "num_tokens": 1385815.0, "reward": 4.293750286102295, "reward_std": 7.921655178070068, "rewards/rollout_reward_func/mean": 4.293750286102295, "rewards/rollout_reward_func/std": 7.921655178070068, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0430811414607888e-07, "step": 107, "step_time": 8.090079850997427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1958415175049595e-06, "epoch": 4.320000172800007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.904452511831579e-10, "kl": 0.0, "learning_rate": 7.4629799999713625e-06, "loss": -0.0, "num_tokens": 1398782.0, "reward": 3.625, "reward_std": 1.6171993017196655, "rewards/rollout_reward_func/mean": 3.625, "rewards/rollout_reward_func/std": 1.617199182510376, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 108, "step_time": 8.071905244993104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3396056434421553e-06, "epoch": 4.360000174400007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.950871491602669e-10, "kl": 0.0, "learning_rate": 7.462979999970561e-06, "loss": -0.0, "num_tokens": 1411762.0, "reward": 9.931249618530273, "reward_std": 10.160558700561523, "rewards/rollout_reward_func/mean": 9.931249618530273, "rewards/rollout_reward_func/std": 10.16055965423584, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 109, "step_time": 7.858738459988672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.319572701026118e-06, "epoch": 4.400000176000007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.9594801609356125e-10, "kl": 0.0, "learning_rate": 7.462979999969749e-06, "loss": 0.0, "num_tokens": 1424676.0, "reward": 6.06874942779541, "reward_std": 7.832387447357178, "rewards/rollout_reward_func/mean": 6.06874942779541, "rewards/rollout_reward_func/std": 7.832387924194336, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.1175869474300271e-07, "step": 110, "step_time": 7.830410494003445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0808243448300345e-06, "epoch": 4.440000177600007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.549327138279267e-10, "kl": 0.0, "learning_rate": 7.4629799999689256e-06, "loss": 0.0, "num_tokens": 1437657.0, "reward": 4.837500095367432, "reward_std": 5.920233249664307, "rewards/rollout_reward_func/mean": 4.837500095367432, "rewards/rollout_reward_func/std": 5.920234203338623, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 111, "step_time": 7.83353382700443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3792459558080736e-06, "epoch": 4.4800001792000076e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.883200067582692e-10, "kl": 0.0, "learning_rate": 7.462979999968091e-06, "loss": -0.0, "num_tokens": 1450628.0, "reward": 4.050000190734863, "reward_std": 5.743982791900635, "rewards/rollout_reward_func/mean": 4.050000190734863, "rewards/rollout_reward_func/std": 5.743982791900635, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 112, "step_time": 7.815977327001747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.278642114106333e-06, "epoch": 4.5200001808000075e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.467699183357098e-10, "kl": 0.0, "learning_rate": 7.462979999967246e-06, "loss": 0.0, "num_tokens": 1463607.0, "reward": 2.8062500953674316, "reward_std": 5.261998176574707, "rewards/rollout_reward_func/mean": 2.8062500953674316, "rewards/rollout_reward_func/std": 5.261998653411865, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 113, "step_time": 8.128304762009066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1180106557494582e-06, "epoch": 4.5600001824000075e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.2149660711411343e-10, "kl": 0.0, "learning_rate": 7.46297999996639e-06, "loss": 0.0, "num_tokens": 1476595.0, "reward": 6.893750190734863, "reward_std": 8.794731140136719, "rewards/rollout_reward_func/mean": 6.893750190734863, "rewards/rollout_reward_func/std": 8.794732093811035, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.19563723553074e-08, "step": 114, "step_time": 8.060030290987925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.365838184914537e-06, "epoch": 4.6000001840000074e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.6841374395635285e-10, "kl": 0.0, "learning_rate": 7.462979999965523e-06, "loss": -0.0, "num_tokens": 1489530.0, "reward": 1.8624999523162842, "reward_std": 2.176809549331665, "rewards/rollout_reward_func/mean": 1.8624999523162842, "rewards/rollout_reward_func/std": 2.176809549331665, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0679163864324437e-07, "step": 115, "step_time": 8.051116536007612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2727255668542057e-06, "epoch": 4.640000185600007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.7880034392246955e-10, "kl": 0.0, "learning_rate": 7.4629799999646455e-06, "loss": 0.0, "num_tokens": 1502455.0, "reward": 4.643750190734863, "reward_std": 6.614022254943848, "rewards/rollout_reward_func/mean": 4.643750190734863, "rewards/rollout_reward_func/std": 6.614022254943848, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 116, "step_time": 8.16085835499689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2649231681270976e-06, "epoch": 4.680000187200007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.8578461819250833e-10, "kl": 0.0, "learning_rate": 7.462979999963755e-06, "loss": 0.0, "num_tokens": 1515429.0, "reward": 3.325000047683716, "reward_std": 2.6175053119659424, "rewards/rollout_reward_func/mean": 3.325000047683716, "rewards/rollout_reward_func/std": 2.6175053119659424, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 117, "step_time": 8.099111490992073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1682104716092e-06, "epoch": 4.720000188800007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.428127976461525e-10, "kl": 0.0, "learning_rate": 7.462979999962855e-06, "loss": 0.0, "num_tokens": 1528369.0, "reward": 4.456250190734863, "reward_std": 4.604341506958008, "rewards/rollout_reward_func/mean": 4.456250190734863, "rewards/rollout_reward_func/std": 4.604341506958008, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.940695295223122e-08, "step": 118, "step_time": 8.15359324500605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.201798707801572e-06, "epoch": 4.760000190400007e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.367274432035771e-10, "kl": 0.0, "learning_rate": 7.462979999961943e-06, "loss": -0.0, "num_tokens": 1541325.0, "reward": 3.5625, "reward_std": 5.243392467498779, "rewards/rollout_reward_func/mean": 3.5625, "rewards/rollout_reward_func/std": 5.2433929443359375, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 119, "step_time": 8.095369422990188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.242350291226103e-06, "epoch": 4.800000192000008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.5181889906633614e-10, "kl": 0.0, "learning_rate": 7.462979999961021e-06, "loss": -0.0, "num_tokens": 1554280.0, "reward": 4.887499809265137, "reward_std": 7.692582607269287, "rewards/rollout_reward_func/mean": 4.887499809265137, "rewards/rollout_reward_func/std": 7.692583084106445, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0927516314040986e-07, "step": 120, "step_time": 8.136039386998164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.19866421957704e-06, "epoch": 4.840000193600008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.827719169928855e-10, "kl": 0.0, "learning_rate": 7.462979999960088e-06, "loss": 0.0, "num_tokens": 1567247.0, "reward": 7.050000190734863, "reward_std": 12.40892219543457, "rewards/rollout_reward_func/mean": 7.050000190734863, "rewards/rollout_reward_func/std": 12.40892219543457, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 121, "step_time": 8.140484737996303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.254504252618972e-06, "epoch": 4.880000195200008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.652949804282656e-10, "kl": 0.0, "learning_rate": 7.462979999959143e-06, "loss": -0.0, "num_tokens": 1580166.0, "reward": 4.700000286102295, "reward_std": 5.75476598739624, "rewards/rollout_reward_func/mean": 4.700000286102295, "rewards/rollout_reward_func/std": 5.754766464233398, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 122, "step_time": 8.121054053997796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1939338807897e-06, "epoch": 4.920000196800008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.5250812552002344e-10, "kl": 0.0, "learning_rate": 7.462979999958188e-06, "loss": 0.0, "num_tokens": 1593130.0, "reward": 4.550000190734863, "reward_std": 6.0180840492248535, "rewards/rollout_reward_func/mean": 4.550000190734863, "rewards/rollout_reward_func/std": 6.018083572387695, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 123, "step_time": 8.14854699899297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1944028958387207e-06, "epoch": 4.960000198400008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.3471014021225756e-10, "kl": 0.0, "learning_rate": 7.46297999995722e-06, "loss": -0.0, "num_tokens": 1606083.0, "reward": 6.418749809265137, "reward_std": 7.421879768371582, "rewards/rollout_reward_func/mean": 6.418749809265137, "rewards/rollout_reward_func/std": 7.421879768371582, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 124, "step_time": 8.209908066004573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.300002392985334e-06, "epoch": 5.000000200000008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.2473856936832135e-10, "kl": 0.0, "learning_rate": 7.462979999956242e-06, "loss": -0.0, "num_tokens": 1619049.0, "reward": 1.556249976158142, "reward_std": 2.243499994277954, "rewards/rollout_reward_func/mean": 1.556249976158142, "rewards/rollout_reward_func/std": 2.243499994277954, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 125, "step_time": 8.124377987995103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2197532416612376e-06, "epoch": 5.040000201600008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.6582144818654285e-10, "kl": 0.0, "learning_rate": 7.462979999955254e-06, "loss": 0.0, "num_tokens": 1631942.0, "reward": 3.6125001907348633, "reward_std": 5.869568824768066, "rewards/rollout_reward_func/mean": 3.6125001907348633, "rewards/rollout_reward_func/std": 5.869568347930908, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 126, "step_time": 8.18675276099384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.372755659507675e-06, "epoch": 5.080000203200008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 4.045115631345908e-10, "kl": 0.0, "learning_rate": 7.462979999954253e-06, "loss": -0.0, "num_tokens": 1644887.0, "reward": 3.456249713897705, "reward_std": 5.599996089935303, "rewards/rollout_reward_func/mean": 3.456249713897705, "rewards/rollout_reward_func/std": 5.599997043609619, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0927516314040986e-07, "step": 127, "step_time": 8.261433876003139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.317912304761194e-06, "epoch": 5.120000204800008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.797322651293399e-10, "kl": 0.0, "learning_rate": 7.462979999953243e-06, "loss": 0.0, "num_tokens": 1657841.0, "reward": 4.399999618530273, "reward_std": 5.938686370849609, "rewards/rollout_reward_func/mean": 4.399999618530273, "rewards/rollout_reward_func/std": 5.938686847686768, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 128, "step_time": 8.248244262009393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1864728552145607e-06, "epoch": 5.160000206400009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.036590978666709e-10, "kl": 0.0, "learning_rate": 7.46297999995222e-06, "loss": 0.0, "num_tokens": 1670816.0, "reward": 3.575000286102295, "reward_std": 5.304652214050293, "rewards/rollout_reward_func/mean": 3.575000286102295, "rewards/rollout_reward_func/std": 5.304652214050293, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 129, "step_time": 8.096175769002002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.214307301073859e-06, "epoch": 5.2000002080000086e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.498594664501752e-10, "kl": 0.0, "learning_rate": 7.462979999951188e-06, "loss": 0.0, "num_tokens": 1683764.0, "reward": 4.393750190734863, "reward_std": 6.392544746398926, "rewards/rollout_reward_func/mean": 4.393750190734863, "rewards/rollout_reward_func/std": 6.392544269561768, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 130, "step_time": 8.128070815007959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.369142833913429e-06, "epoch": 5.2400002096000085e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.6320268187163265e-10, "kl": 0.0, "learning_rate": 7.4629799999501435e-06, "loss": 0.0, "num_tokens": 1696722.0, "reward": 5.931250095367432, "reward_std": 8.486987113952637, "rewards/rollout_reward_func/mean": 5.931250095367432, "rewards/rollout_reward_func/std": 8.486988067626953, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0927516314040986e-07, "step": 131, "step_time": 8.087185525997484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.261354595134435e-06, "epoch": 5.2800002112000085e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.0326197109076247e-10, "kl": 0.0, "learning_rate": 7.462979999949088e-06, "loss": -0.0, "num_tokens": 1709681.0, "reward": 5.256250381469727, "reward_std": 6.405724048614502, "rewards/rollout_reward_func/mean": 5.256250381469727, "rewards/rollout_reward_func/std": 6.405723571777344, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 132, "step_time": 8.080718965997221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.302639018125774e-06, "epoch": 5.3200002128000085e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.4121385444608734e-10, "kl": 0.0, "learning_rate": 7.462979999948022e-06, "loss": 0.0, "num_tokens": 1722627.0, "reward": 3.6937499046325684, "reward_std": 5.6276068687438965, "rewards/rollout_reward_func/mean": 3.6937499046325684, "rewards/rollout_reward_func/std": 5.6276068687438965, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.43740019465622e-08, "step": 133, "step_time": 8.072673355993174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2705606568251824e-06, "epoch": 5.360000214400008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.65145405631273e-10, "kl": 0.0, "learning_rate": 7.462979999946944e-06, "loss": 0.0, "num_tokens": 1735604.0, "reward": 2.8687498569488525, "reward_std": 3.703461170196533, "rewards/rollout_reward_func/mean": 2.8687498569488525, "rewards/rollout_reward_func/std": 3.7034614086151123, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 134, "step_time": 8.070155354020244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.20201786760299e-06, "epoch": 5.400000216000008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.3693710882177754e-10, "kl": 0.0, "learning_rate": 7.462979999945857e-06, "loss": 0.0, "num_tokens": 1748607.0, "reward": 4.612500190734863, "reward_std": 6.741302967071533, "rewards/rollout_reward_func/mean": 4.612500190734863, "rewards/rollout_reward_func/std": 6.741302967071533, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.940696005765858e-08, "step": 135, "step_time": 8.000138604998938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2351397888087376e-06, "epoch": 5.440000217600008e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.540260779504422e-10, "kl": 0.0, "learning_rate": 7.462979999944756e-06, "loss": -0.0, "num_tokens": 1761585.0, "reward": 3.84375, "reward_std": 5.636898994445801, "rewards/rollout_reward_func/mean": 3.84375, "rewards/rollout_reward_func/std": 5.636898994445801, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0182458254348603e-07, "step": 136, "step_time": 8.06034161599382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.23270913579654e-06, "epoch": 5.480000219200009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.795702558344715e-10, "kl": 0.0, "learning_rate": 7.4629799999436476e-06, "loss": -0.0, "num_tokens": 1774542.0, "reward": 5.8125, "reward_std": 9.576003074645996, "rewards/rollout_reward_func/mean": 5.8125, "rewards/rollout_reward_func/std": 9.576003074645996, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 137, "step_time": 8.051848074996087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2310120755264506e-06, "epoch": 5.520000220800009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.0132052408760046e-10, "kl": 0.0, "learning_rate": 7.462979999942525e-06, "loss": 0.0, "num_tokens": 1787473.0, "reward": 4.012499809265137, "reward_std": 4.70869779586792, "rewards/rollout_reward_func/mean": 4.012499809265137, "rewards/rollout_reward_func/std": 4.70869779586792, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 138, "step_time": 8.17314529199939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.271928281061264e-06, "epoch": 5.560000222400009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.2278626993509363e-10, "kl": 0.0, "learning_rate": 7.462979999941393e-06, "loss": -0.0, "num_tokens": 1800464.0, "reward": 3.1062498092651367, "reward_std": 3.1992642879486084, "rewards/rollout_reward_func/mean": 3.1062498092651367, "rewards/rollout_reward_func/std": 3.1992645263671875, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 139, "step_time": 8.089316319004865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.258756808259932e-06, "epoch": 5.600000224000009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.7064678276289555e-10, "kl": 0.0, "learning_rate": 7.462979999940249e-06, "loss": -0.0, "num_tokens": 1813424.0, "reward": 2.9624998569488525, "reward_std": 5.662846565246582, "rewards/rollout_reward_func/mean": 2.9624998569488525, "rewards/rollout_reward_func/std": 5.662846565246582, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 140, "step_time": 8.098462962996564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.217839579543579e-06, "epoch": 5.640000225600009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.8377467042872695e-10, "kl": 0.0, "learning_rate": 7.462979999939094e-06, "loss": -0.0, "num_tokens": 1826375.0, "reward": 3.924999952316284, "reward_std": 4.175563812255859, "rewards/rollout_reward_func/mean": 3.924999952316284, "rewards/rollout_reward_func/std": 4.175563812255859, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 141, "step_time": 8.169456928982981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1457159249393953e-06, "epoch": 5.680000227200009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.6908381078882826e-10, "kl": 0.0, "learning_rate": 7.462979999937929e-06, "loss": -0.0, "num_tokens": 1839369.0, "reward": 6.556249618530273, "reward_std": 8.188363075256348, "rewards/rollout_reward_func/mean": 6.556249618530273, "rewards/rollout_reward_func/std": 8.188364028930664, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443990395790024e-08, "step": 142, "step_time": 8.172915739000018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.222920585381871e-06, "epoch": 5.720000228800009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.2374550823212047e-10, "kl": 0.0, "learning_rate": 7.462979999936753e-06, "loss": 0.0, "num_tokens": 1852303.0, "reward": 4.824999809265137, "reward_std": 6.138023853302002, "rewards/rollout_reward_func/mean": 4.824999809265137, "rewards/rollout_reward_func/std": 6.138023376464844, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 143, "step_time": 8.206199316009588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.187829025501742e-06, "epoch": 5.760000230400009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.639554130823285e-10, "kl": 0.0, "learning_rate": 7.462979999935564e-06, "loss": -0.0, "num_tokens": 1865219.0, "reward": 5.074999809265137, "reward_std": 7.24941349029541, "rewards/rollout_reward_func/mean": 5.074999809265137, "rewards/rollout_reward_func/std": 7.249414443969727, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 144, "step_time": 8.095876464001776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2518543403293734e-06, "epoch": 5.80000023200001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.869902648861e-10, "kl": 0.0, "learning_rate": 7.462979999934366e-06, "loss": 0.0, "num_tokens": 1878135.0, "reward": 4.962499618530273, "reward_std": 7.240890979766846, "rewards/rollout_reward_func/mean": 4.962499618530273, "rewards/rollout_reward_func/std": 7.240890979766846, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 145, "step_time": 8.107579157003784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2898061615705956e-06, "epoch": 5.84000023360001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.135647574925571e-10, "kl": 0.0, "learning_rate": 7.462979999933157e-06, "loss": -0.0, "num_tokens": 1891091.0, "reward": 4.793749809265137, "reward_std": 9.808395385742188, "rewards/rollout_reward_func/mean": 4.793749809265137, "rewards/rollout_reward_func/std": 9.808395385742188, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 146, "step_time": 7.999451793002663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2120018456917023e-06, "epoch": 5.88000023520001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.416316313702538e-10, "kl": 0.0, "learning_rate": 7.462979999931936e-06, "loss": -0.0, "num_tokens": 1904067.0, "reward": 3.65625, "reward_std": 4.22973108291626, "rewards/rollout_reward_func/mean": 3.65625, "rewards/rollout_reward_func/std": 4.22973108291626, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.189047744939671e-08, "step": 147, "step_time": 8.163257640990196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.249324580816392e-06, "epoch": 5.9200002368000096e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.8757402015244793e-10, "kl": 0.0, "learning_rate": 7.462979999930703e-06, "loss": 0.0, "num_tokens": 1917028.0, "reward": 4.83750057220459, "reward_std": 7.650871276855469, "rewards/rollout_reward_func/mean": 4.83750057220459, "rewards/rollout_reward_func/std": 7.650871753692627, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 148, "step_time": 8.093987317006395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.105520238160352e-06, "epoch": 5.9600002384000095e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.5192206654089944e-10, "kl": 0.0, "learning_rate": 7.46297999992946e-06, "loss": 0.0, "num_tokens": 1930012.0, "reward": 3.375, "reward_std": 5.269978046417236, "rewards/rollout_reward_func/mean": 3.375, "rewards/rollout_reward_func/std": 5.269978046417236, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.93410651517479e-08, "step": 149, "step_time": 8.151090942992596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.330303061626182e-06, "epoch": 6.0000002400000095e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.66860256115109e-10, "kl": 0.0, "learning_rate": 7.462979999928206e-06, "loss": 0.0, "num_tokens": 1942975.0, "reward": 2.7375001907348633, "reward_std": 2.69094181060791, "rewards/rollout_reward_func/mean": 2.7375001907348633, "rewards/rollout_reward_func/std": 2.69094181060791, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 150, "step_time": 8.146212769002886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4565570129198022e-06, "epoch": 6.0400002416000095e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.842015511816953e-10, "kl": 0.0, "learning_rate": 7.4629799999269415e-06, "loss": 0.0, "num_tokens": 1955910.0, "reward": 3.9000000953674316, "reward_std": 7.267553329467773, "rewards/rollout_reward_func/mean": 3.9000000953674316, "rewards/rollout_reward_func/std": 7.267553806304932, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0927516314040986e-07, "step": 151, "step_time": 8.171023381997657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.187846092738255e-06, "epoch": 6.080000243200009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.655233255488554e-10, "kl": 0.0, "learning_rate": 7.462979999925666e-06, "loss": 0.0, "num_tokens": 1968883.0, "reward": 3.9437501430511475, "reward_std": 6.428060054779053, "rewards/rollout_reward_func/mean": 3.9437501430511475, "rewards/rollout_reward_func/std": 6.428061008453369, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.692342134963837e-08, "step": 152, "step_time": 8.091907752015686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.280047482372538e-06, "epoch": 6.120000244800009e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.478250105131252e-10, "kl": 0.0, "learning_rate": 7.462979999924378e-06, "loss": -0.0, "num_tokens": 1981820.0, "reward": 3.268749952316284, "reward_std": 5.756702899932861, "rewards/rollout_reward_func/mean": 3.268749952316284, "rewards/rollout_reward_func/std": 5.756702899932861, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 153, "step_time": 8.143722750006418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3073689305874723e-06, "epoch": 6.16000024640001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.1678815126667814e-10, "kl": 0.0, "learning_rate": 7.46297999992308e-06, "loss": 0.0, "num_tokens": 1994772.0, "reward": 2.1500000953674316, "reward_std": 1.364306926727295, "rewards/rollout_reward_func/mean": 2.1500000953674316, "rewards/rollout_reward_func/std": 1.3643070459365845, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 154, "step_time": 8.119664502002706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3186460111901397e-06, "epoch": 6.20000024800001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.9511140753335496e-10, "kl": 0.0, "learning_rate": 7.46297999992177e-06, "loss": 0.0, "num_tokens": 2007732.0, "reward": 8.100000381469727, "reward_std": 10.669208526611328, "rewards/rollout_reward_func/mean": 8.100000381469727, "rewards/rollout_reward_func/std": 10.669208526611328, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 155, "step_time": 8.161288703988248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.229249957963475e-06, "epoch": 6.24000024960001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.23155077749837e-10, "kl": 0.0, "learning_rate": 7.462979999920451e-06, "loss": -0.0, "num_tokens": 2020691.0, "reward": 2.518749952316284, "reward_std": 2.5974907875061035, "rewards/rollout_reward_func/mean": 2.518749952316284, "rewards/rollout_reward_func/std": 2.5974907875061035, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.000000238418579, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 1.0679163864324437e-07, "step": 156, "step_time": 8.23347092300537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1767494899904705e-06, "epoch": 6.28000025120001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.97562224860215e-10, "kl": 0.0, "learning_rate": 7.462979999919119e-06, "loss": -0.0, "num_tokens": 2033668.0, "reward": 5.787500381469727, "reward_std": 7.8658647537231445, "rewards/rollout_reward_func/mean": 5.787500381469727, "rewards/rollout_reward_func/std": 7.865865230560303, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.940695295223122e-08, "step": 157, "step_time": 8.243004686984932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.261601380837419e-06, "epoch": 6.32000025280001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 3.0819563567874297e-10, "kl": 0.0, "learning_rate": 7.4629799999177775e-06, "loss": -0.0, "num_tokens": 2046652.0, "reward": 2.3562498092651367, "reward_std": 1.787724256515503, "rewards/rollout_reward_func/mean": 2.3562498092651367, "rewards/rollout_reward_func/std": 1.787724256515503, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 8.443989685247288e-08, "step": 158, "step_time": 8.272544232997461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.302038637935766e-06, "epoch": 6.36000025440001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.420934286373466e-10, "kl": 0.0, "learning_rate": 7.462979999916423e-06, "loss": 0.0, "num_tokens": 2059648.0, "reward": 6.112499713897705, "reward_std": 8.85993766784668, "rewards/rollout_reward_func/mean": 6.112499713897705, "rewards/rollout_reward_func/std": 8.859938621520996, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.934105804632054e-08, "step": 159, "step_time": 8.230238098985865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2325926636312943e-06, "epoch": 6.40000025600001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 2.77389250458171e-10, "kl": 0.0, "learning_rate": 7.4629799999150585e-06, "loss": 0.0, "num_tokens": 2072619.0, "reward": 5.71875, "reward_std": 7.712519645690918, "rewards/rollout_reward_func/mean": 5.71875, "rewards/rollout_reward_func/std": 7.712520122528076, "sampling/importance_sampling_ratio/max": 1.0000003576278687, "sampling/importance_sampling_ratio/mean": 1.000000238418579, "sampling/importance_sampling_ratio/min": 1.0000001192092896, "sampling/sampling_logp_difference/max": 3.576278118089249e-07, "sampling/sampling_logp_difference/mean": 9.685753354915505e-08, "step": 160, "step_time": 8.214884514010919 }, { "clip_ratio/high_max": 0.000405844155466184, "clip_ratio/high_mean": 0.000405844155466184, "clip_ratio/low_mean": 0.0008499902469338849, "clip_ratio/low_min": 0.0008499902469338849, "clip_ratio/region_mean": 0.001255834402400069, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 301.0625, "completions/mean_terminated_length": 301.0625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.07531798351556063, "epoch": 6.44000025760001e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.00024292667512781918, "kl": 0.0006614619596803095, "learning_rate": 7.462979999913684e-06, "loss": 0.0, "num_tokens": 2090366.0, "reward": -3.048875093460083, "reward_std": 6.149080753326416, "rewards/rollout_reward_func/mean": -3.048875093460083, "rewards/rollout_reward_func/std": 6.149080276489258, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.40625, "sampling/sampling_logp_difference/mean": 0.4920227825641632, "step": 161, "step_time": 10.455447467007616 }, { "clip_ratio/high_max": 0.010406978952232748, "clip_ratio/high_mean": 0.010406978952232748, "clip_ratio/low_mean": 0.0076682733779307455, "clip_ratio/low_min": 0.0076682733779307455, "clip_ratio/region_mean": 0.018075252417474985, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.08232653513550758, "epoch": 6.480000259200011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.003013529581949115, "kl": 0.003964502859162167, "learning_rate": 7.462979999912297e-06, "loss": 0.0, "num_tokens": 2107889.0, "reward": -0.21806243062019348, "reward_std": 5.569501876831055, "rewards/rollout_reward_func/mean": -0.21806243062019348, "rewards/rollout_reward_func/std": 5.569501876831055, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.40625, "sampling/sampling_logp_difference/mean": 0.5227737426757812, "step": 162, "step_time": 10.377935386008176 }, { "clip_ratio/high_max": 0.0059829459933098406, "clip_ratio/high_mean": 0.0059829459933098406, "clip_ratio/low_mean": 0.006227041158126667, "clip_ratio/low_min": 0.006227041158126667, "clip_ratio/region_mean": 0.01220998726785183, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 298.3125, "completions/mean_terminated_length": 298.3125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.07516410760581493, "epoch": 6.520000260800011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008916485821828246, "kl": 0.005264940176857635, "learning_rate": 7.4629799999109e-06, "loss": 0.0, "num_tokens": 2125597.0, "reward": -1.2063125371932983, "reward_std": 6.699039459228516, "rewards/rollout_reward_func/mean": -1.2063125371932983, "rewards/rollout_reward_func/std": 6.699039936065674, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.1153678894043, "sampling/sampling_logp_difference/mean": 0.5089957118034363, "step": 163, "step_time": 10.451397179000196 }, { "clip_ratio/high_max": 0.006482795171905309, "clip_ratio/high_mean": 0.006482795171905309, "clip_ratio/low_mean": 0.00726001855218783, "clip_ratio/low_min": 0.00726001855218783, "clip_ratio/region_mean": 0.013742813549470156, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 308.0625, "completions/mean_terminated_length": 308.0625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.07950770854949951, "epoch": 6.560000262400011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009470506920479238, "kl": 0.0032599992700852454, "learning_rate": 7.46297999990949e-06, "loss": 0.0, "num_tokens": 2143496.0, "reward": 3.756437301635742, "reward_std": 11.231764793395996, "rewards/rollout_reward_func/mean": 3.756437301635742, "rewards/rollout_reward_func/std": 11.231765747070312, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.8125, "sampling/sampling_logp_difference/mean": 0.4679420590400696, "step": 164, "step_time": 10.427626744007284 }, { "clip_ratio/high_max": 0.01054448654758744, "clip_ratio/high_mean": 0.01054448654758744, "clip_ratio/low_mean": 0.0047625267470721155, "clip_ratio/low_min": 0.0047625267470721155, "clip_ratio/region_mean": 0.015307013352867216, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.07649955525994301, "epoch": 6.600000264000011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013597665820270777, "kl": 0.00486937616369687, "learning_rate": 7.462979999908071e-06, "loss": 0.0, "num_tokens": 2161203.0, "reward": -4.004687309265137, "reward_std": 7.0399250984191895, "rewards/rollout_reward_func/mean": -4.004687309265137, "rewards/rollout_reward_func/std": 7.039925575256348, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.9375114440918, "sampling/sampling_logp_difference/mean": 0.502415657043457, "step": 165, "step_time": 10.47943463599222 }, { "clip_ratio/high_max": 0.009512068776530214, "clip_ratio/high_mean": 0.009512068776530214, "clip_ratio/low_mean": 0.00405233126366511, "clip_ratio/low_min": 0.00405233126366511, "clip_ratio/region_mean": 0.013564399967435747, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 294.3125, "completions/mean_terminated_length": 294.3125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07197013031691313, "epoch": 6.6400002656000106e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009629783453419805, "kl": 0.004041691921884194, "learning_rate": 7.46297999990664e-06, "loss": 0.0, "num_tokens": 2178813.0, "reward": -4.371624946594238, "reward_std": 6.371729850769043, "rewards/rollout_reward_func/mean": -4.371624946594238, "rewards/rollout_reward_func/std": 6.371730327606201, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.750003814697266, "sampling/sampling_logp_difference/mean": 0.49506810307502747, "step": 166, "step_time": 14.656383432986331 }, { "clip_ratio/high_max": 0.007402752467896789, "clip_ratio/high_mean": 0.007402752467896789, "clip_ratio/low_mean": 0.005066068610176444, "clip_ratio/low_min": 0.005066068610176444, "clip_ratio/region_mean": 0.012468820787034929, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 296.375, "completions/mean_terminated_length": 296.375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.0752051966264844, "epoch": 6.6800002672000105e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010355449048802257, "kl": 0.00463852338725701, "learning_rate": 7.462979999905198e-06, "loss": 0.0, "num_tokens": 2196476.0, "reward": -2.0443124771118164, "reward_std": 2.7881104946136475, "rewards/rollout_reward_func/mean": -2.0443124771118164, "rewards/rollout_reward_func/std": 2.7881104946136475, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.5, "sampling/sampling_logp_difference/mean": 0.47134244441986084, "step": 167, "step_time": 10.56785387200216 }, { "clip_ratio/high_max": 0.004807441437151283, "clip_ratio/high_mean": 0.004807441437151283, "clip_ratio/low_mean": 0.008668693160871044, "clip_ratio/low_min": 0.008668693160871044, "clip_ratio/region_mean": 0.013476134627126157, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.073267194442451, "epoch": 6.7200002688000105e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010840960312634706, "kl": 0.003960417147027329, "learning_rate": 7.4629799999037455e-06, "loss": 0.0, "num_tokens": 2213943.0, "reward": 2.274625062942505, "reward_std": 12.055625915527344, "rewards/rollout_reward_func/mean": 2.274625062942505, "rewards/rollout_reward_func/std": 12.055625915527344, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.250003814697266, "sampling/sampling_logp_difference/mean": 0.5480209589004517, "step": 168, "step_time": 10.482453233998967 }, { "clip_ratio/high_max": 0.007080613577272743, "clip_ratio/high_mean": 0.007080613577272743, "clip_ratio/low_mean": 0.004649894806789234, "clip_ratio/low_min": 0.004649894806789234, "clip_ratio/region_mean": 0.011730508354958147, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 287.125, "completions/mean_terminated_length": 287.125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.07662129029631615, "epoch": 6.7600002704000105e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0006327596493065357, "kl": 0.002894752862630412, "learning_rate": 7.462979999902281e-06, "loss": 0.0, "num_tokens": 2231427.0, "reward": -0.8479374647140503, "reward_std": 2.946763753890991, "rewards/rollout_reward_func/mean": -0.8479374647140503, "rewards/rollout_reward_func/std": 2.946763753890991, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.90634536743164, "sampling/sampling_logp_difference/mean": 0.5399976968765259, "step": 169, "step_time": 10.423901763017057 }, { "clip_ratio/high_max": 0.008643045119242743, "clip_ratio/high_mean": 0.008643045119242743, "clip_ratio/low_mean": 0.004094432442798279, "clip_ratio/low_min": 0.004094432442798279, "clip_ratio/region_mean": 0.012737477605696768, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 295.5, "completions/mean_terminated_length": 295.5, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.07633415423333645, "epoch": 6.800000272000011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011058315867558122, "kl": 0.00394137913826853, "learning_rate": 7.462979999900806e-06, "loss": 0.0, "num_tokens": 2249057.0, "reward": -2.579124927520752, "reward_std": 4.0611572265625, "rewards/rollout_reward_func/mean": -2.579124927520752, "rewards/rollout_reward_func/std": 4.061157703399658, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.3125, "sampling/sampling_logp_difference/mean": 0.515461802482605, "step": 170, "step_time": 10.445612312993035 }, { "clip_ratio/high_max": 0.004945141670759767, "clip_ratio/high_mean": 0.004945141670759767, "clip_ratio/low_mean": 0.0068987825070507824, "clip_ratio/low_min": 0.0068987825070507824, "clip_ratio/region_mean": 0.011843924294225872, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.07040293188765645, "epoch": 6.840000273600011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.002311327029019594, "kl": 0.00609464653825853, "learning_rate": 7.46297999989932e-06, "loss": 0.0, "num_tokens": 2266704.0, "reward": 7.3870625495910645, "reward_std": 25.382442474365234, "rewards/rollout_reward_func/mean": 7.3870625495910645, "rewards/rollout_reward_func/std": 25.382442474365234, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.0625, "sampling/sampling_logp_difference/mean": 0.49431705474853516, "step": 171, "step_time": 10.513636074989336 }, { "clip_ratio/high_max": 0.0057162074372172356, "clip_ratio/high_mean": 0.0057162074372172356, "clip_ratio/low_mean": 0.0066111111373174936, "clip_ratio/low_min": 0.0066111111373174936, "clip_ratio/region_mean": 0.012327318545430899, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 283.1875, "completions/mean_terminated_length": 283.1875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.07699470594525337, "epoch": 6.880000275200011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010247996542602777, "kl": 0.0034955381706822664, "learning_rate": 7.462979999897823e-06, "loss": 0.0, "num_tokens": 2284126.0, "reward": -1.630750060081482, "reward_std": 4.766759395599365, "rewards/rollout_reward_func/mean": -1.630750060081482, "rewards/rollout_reward_func/std": 4.766759872436523, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.125, "sampling/sampling_logp_difference/mean": 0.5239483714103699, "step": 172, "step_time": 10.473675487999571 }, { "clip_ratio/high_max": 0.006885909766424447, "clip_ratio/high_mean": 0.006885909766424447, "clip_ratio/low_mean": 0.008742622681893408, "clip_ratio/low_min": 0.008742622681893408, "clip_ratio/region_mean": 0.015628532506525517, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07626987528055906, "epoch": 6.920000276800011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013048453256487846, "kl": 0.006234777858480811, "learning_rate": 7.462979999896315e-06, "loss": 0.0, "num_tokens": 2301315.0, "reward": 0.5512499809265137, "reward_std": 5.497782230377197, "rewards/rollout_reward_func/mean": 0.5512499809265137, "rewards/rollout_reward_func/std": 5.4977827072143555, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.898468017578125, "sampling/sampling_logp_difference/mean": 0.5337214469909668, "step": 173, "step_time": 10.10330947300099 }, { "clip_ratio/high_max": 0.004351581854280084, "clip_ratio/high_mean": 0.004351581854280084, "clip_ratio/low_mean": 0.007762177789118141, "clip_ratio/low_min": 0.007762177789118141, "clip_ratio/region_mean": 0.012113759585190564, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.07694732304662466, "epoch": 6.960000278400011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.001211977214552462, "kl": 0.005513776530278847, "learning_rate": 7.462979999894797e-06, "loss": 0.0, "num_tokens": 2318774.0, "reward": 5.431375026702881, "reward_std": 22.641254425048828, "rewards/rollout_reward_func/mean": 5.431375026702881, "rewards/rollout_reward_func/std": 22.64125633239746, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.6875, "sampling/sampling_logp_difference/mean": 0.5127522945404053, "step": 174, "step_time": 10.522105784992164 }, { "clip_ratio/high_max": 0.011238687031436712, "clip_ratio/high_mean": 0.011238687031436712, "clip_ratio/low_mean": 0.0047899863857310265, "clip_ratio/low_min": 0.0047899863857310265, "clip_ratio/region_mean": 0.016028673388063908, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.07810990139842033, "epoch": 7.000000280000011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.001191670773550868, "kl": 0.006333552795695141, "learning_rate": 7.462979999893266e-06, "loss": 0.0, "num_tokens": 2336276.0, "reward": -2.712750196456909, "reward_std": 11.328715324401855, "rewards/rollout_reward_func/mean": -2.712750196456909, "rewards/rollout_reward_func/std": 11.328715324401855, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.46875, "sampling/sampling_logp_difference/mean": 0.5164696574211121, "step": 175, "step_time": 10.483888187001867 }, { "clip_ratio/high_max": 0.008311323064845055, "clip_ratio/high_mean": 0.008311323064845055, "clip_ratio/low_mean": 0.008241371062467806, "clip_ratio/low_min": 0.008241371062467806, "clip_ratio/region_mean": 0.016552694141864777, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.07885020412504673, "epoch": 7.040000281600011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.001519530895166099, "kl": 0.005223864805884659, "learning_rate": 7.462979999891724e-06, "loss": 0.0, "num_tokens": 2353618.0, "reward": 0.8555000424385071, "reward_std": 8.95112133026123, "rewards/rollout_reward_func/mean": 0.8555000424385071, "rewards/rollout_reward_func/std": 8.951122283935547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.187503814697266, "sampling/sampling_logp_difference/mean": 0.5218495726585388, "step": 176, "step_time": 10.316310858004726 }, { "clip_ratio/high_max": 0.006360433122608811, "clip_ratio/high_mean": 0.006360433122608811, "clip_ratio/low_mean": 0.007861283491365612, "clip_ratio/low_min": 0.007861283491365612, "clip_ratio/region_mean": 0.014221716672182083, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.07272684248164296, "epoch": 7.080000283200011e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008048543822951615, "kl": 0.0037343741569202393, "learning_rate": 7.462979999890171e-06, "loss": 0.0, "num_tokens": 2371241.0, "reward": 4.884812355041504, "reward_std": 25.609315872192383, "rewards/rollout_reward_func/mean": 4.884812355041504, "rewards/rollout_reward_func/std": 25.609315872192383, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.125, "sampling/sampling_logp_difference/mean": 0.5129505395889282, "step": 177, "step_time": 10.418962989999272 }, { "clip_ratio/high_max": 0.006776359223295003, "clip_ratio/high_mean": 0.006776359223295003, "clip_ratio/low_mean": 0.007790409232256934, "clip_ratio/low_min": 0.007790409232256934, "clip_ratio/region_mean": 0.014566768310032785, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 291.6875, "completions/mean_terminated_length": 291.6875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.07525542937219143, "epoch": 7.120000284800012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0019036948215216398, "kl": 0.006873317179270089, "learning_rate": 7.462979999888608e-06, "loss": 0.0, "num_tokens": 2388811.0, "reward": 4.312375545501709, "reward_std": 23.36739730834961, "rewards/rollout_reward_func/mean": 4.312375545501709, "rewards/rollout_reward_func/std": 23.367399215698242, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.4375, "sampling/sampling_logp_difference/mean": 0.5202291011810303, "step": 178, "step_time": 10.441277626006922 }, { "clip_ratio/high_max": 0.00687522380030714, "clip_ratio/high_mean": 0.00687522380030714, "clip_ratio/low_mean": 0.00825929018901661, "clip_ratio/low_min": 0.00825929018901661, "clip_ratio/region_mean": 0.015134514309465885, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.07474346552044153, "epoch": 7.160000286400012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.002246933290734887, "kl": 0.005603231955319643, "learning_rate": 7.4629799998870344e-06, "loss": 0.0, "num_tokens": 2406258.0, "reward": -2.211750030517578, "reward_std": 9.932024002075195, "rewards/rollout_reward_func/mean": -2.211750030517578, "rewards/rollout_reward_func/std": 9.932024002075195, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.87500762939453, "sampling/sampling_logp_difference/mean": 0.5128257870674133, "step": 179, "step_time": 10.436197410992463 }, { "clip_ratio/high_max": 0.01030973409069702, "clip_ratio/high_mean": 0.01030973409069702, "clip_ratio/low_mean": 0.006350549258058891, "clip_ratio/low_min": 0.006350549258058891, "clip_ratio/region_mean": 0.016660283436067402, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.07370437448844314, "epoch": 7.200000288000012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026121200062334538, "kl": 0.007186158589320257, "learning_rate": 7.462979999885449e-06, "loss": 0.0, "num_tokens": 2423665.0, "reward": -1.440812349319458, "reward_std": 5.130565166473389, "rewards/rollout_reward_func/mean": -1.440812349319458, "rewards/rollout_reward_func/std": 5.130565166473389, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.484375, "sampling/sampling_logp_difference/mean": 0.5653854608535767, "step": 180, "step_time": 10.462989923013083 }, { "clip_ratio/high_max": 0.006423939485102892, "clip_ratio/high_mean": 0.006423939485102892, "clip_ratio/low_mean": 0.006873823353089392, "clip_ratio/low_min": 0.006873823353089392, "clip_ratio/region_mean": 0.013297762954607606, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.07518561650067568, "epoch": 7.240000289600012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008708724053576589, "kl": 0.0043790703639388084, "learning_rate": 7.462979999883852e-06, "loss": 0.0, "num_tokens": 2441078.0, "reward": 0.6349374055862427, "reward_std": 6.828274250030518, "rewards/rollout_reward_func/mean": 0.6349374055862427, "rewards/rollout_reward_func/std": 6.828274726867676, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.28125, "sampling/sampling_logp_difference/mean": 0.518650233745575, "step": 181, "step_time": 10.452838033990702 }, { "clip_ratio/high_max": 0.005383011302910745, "clip_ratio/high_mean": 0.005383011302910745, "clip_ratio/low_mean": 0.004584575857734308, "clip_ratio/low_min": 0.004584575857734308, "clip_ratio/region_mean": 0.009967587131541222, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 288.125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.0770668825134635, "epoch": 7.280000291200012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0018409843323752284, "kl": 0.005416554166004062, "learning_rate": 7.462979999882245e-06, "loss": 0.0, "num_tokens": 2458592.0, "reward": -0.11400008201599121, "reward_std": 6.976894855499268, "rewards/rollout_reward_func/mean": -0.11400008201599121, "rewards/rollout_reward_func/std": 6.976894855499268, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.125221252441406, "sampling/sampling_logp_difference/mean": 0.5222750306129456, "step": 182, "step_time": 10.405951548011217 }, { "clip_ratio/high_max": 0.008948912261985242, "clip_ratio/high_mean": 0.008948912261985242, "clip_ratio/low_mean": 0.004299771288060583, "clip_ratio/low_min": 0.004299771288060583, "clip_ratio/region_mean": 0.013248683593701571, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 300.9375, "completions/mean_terminated_length": 300.9375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.07753410469740629, "epoch": 7.320000292800012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017209012294188142, "kl": 0.006769103230908513, "learning_rate": 7.462979999880627e-06, "loss": 0.0, "num_tokens": 2476345.0, "reward": 0.34925001859664917, "reward_std": 2.4275684356689453, "rewards/rollout_reward_func/mean": 0.34925001859664917, "rewards/rollout_reward_func/std": 2.4275684356689453, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.140625, "sampling/sampling_logp_difference/mean": 0.5178714394569397, "step": 183, "step_time": 10.438457136995567 }, { "clip_ratio/high_max": 0.008047100971452892, "clip_ratio/high_mean": 0.008047100971452892, "clip_ratio/low_mean": 0.004500560404267162, "clip_ratio/low_min": 0.004500560404267162, "clip_ratio/region_mean": 0.012547661317512393, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.07136856066063046, "epoch": 7.360000294400012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010972988093271852, "kl": 0.007577634562039748, "learning_rate": 7.462979999878996e-06, "loss": 0.0, "num_tokens": 2493972.0, "reward": -3.1034374237060547, "reward_std": 3.1368467807769775, "rewards/rollout_reward_func/mean": -3.1034374237060547, "rewards/rollout_reward_func/std": 3.1368467807769775, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.5, "sampling/sampling_logp_difference/mean": 0.5004894137382507, "step": 184, "step_time": 10.513949667998531 }, { "clip_ratio/high_max": 0.006875048333313316, "clip_ratio/high_mean": 0.006875048333313316, "clip_ratio/low_mean": 0.005520216975128278, "clip_ratio/low_min": 0.005520216975128278, "clip_ratio/region_mean": 0.012395265221130103, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 293.125, "completions/mean_terminated_length": 293.125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.07153286039829254, "epoch": 7.4000002960000115e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009016370167955756, "kl": 0.0047642259451095015, "learning_rate": 7.462979999877355e-06, "loss": 0.0, "num_tokens": 2511577.0, "reward": 1.9267499446868896, "reward_std": 9.978171348571777, "rewards/rollout_reward_func/mean": 1.9267499446868896, "rewards/rollout_reward_func/std": 9.978171348571777, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.75, "sampling/sampling_logp_difference/mean": 0.5519725680351257, "step": 185, "step_time": 10.41048597999179 }, { "clip_ratio/high_max": 0.006827581237303093, "clip_ratio/high_mean": 0.006827581237303093, "clip_ratio/low_mean": 0.005499523540493101, "clip_ratio/low_min": 0.005499523540493101, "clip_ratio/region_mean": 0.012327104806900024, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 294.75, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0741916736587882, "epoch": 7.4400002976000115e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007987463613972068, "kl": 0.004326197726186365, "learning_rate": 7.462979999875704e-06, "loss": 0.0, "num_tokens": 2529202.0, "reward": -0.8883124589920044, "reward_std": 7.2901997566223145, "rewards/rollout_reward_func/mean": -0.8883124589920044, "rewards/rollout_reward_func/std": 7.290200710296631, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.0, "sampling/sampling_logp_difference/mean": 0.5034537315368652, "step": 186, "step_time": 10.490018301985401 }, { "clip_ratio/high_max": 0.004316613776609302, "clip_ratio/high_mean": 0.004316613776609302, "clip_ratio/low_mean": 0.006952932424610481, "clip_ratio/low_min": 0.006952932424610481, "clip_ratio/region_mean": 0.01126954611390829, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.07697780709713697, "epoch": 7.480000299200012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008017058135010302, "kl": 0.004453385889064521, "learning_rate": 7.462979999874041e-06, "loss": 0.0, "num_tokens": 2546717.0, "reward": 0.9043751358985901, "reward_std": 7.519944667816162, "rewards/rollout_reward_func/mean": 0.9043751358985901, "rewards/rollout_reward_func/std": 7.51994514465332, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.53125, "sampling/sampling_logp_difference/mean": 0.5145547389984131, "step": 187, "step_time": 10.312752165991697 }, { "clip_ratio/high_max": 0.007390920072793961, "clip_ratio/high_mean": 0.007390920072793961, "clip_ratio/low_mean": 0.004313296260079369, "clip_ratio/low_min": 0.004313296260079369, "clip_ratio/region_mean": 0.011704216478392482, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 278.0625, "completions/mean_terminated_length": 278.0625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.06684054806828499, "epoch": 7.520000300800012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0023888275027275085, "kl": 0.008907061070203781, "learning_rate": 7.462979999872367e-06, "loss": 0.0, "num_tokens": 2564035.0, "reward": -2.0974373817443848, "reward_std": 9.19091510772705, "rewards/rollout_reward_func/mean": -2.0974373817443848, "rewards/rollout_reward_func/std": 9.190916061401367, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.0625, "sampling/sampling_logp_difference/mean": 0.5475496053695679, "step": 188, "step_time": 10.292701802012743 }, { "clip_ratio/high_max": 0.006318859930615872, "clip_ratio/high_mean": 0.006318859930615872, "clip_ratio/low_mean": 0.0054163840832188725, "clip_ratio/low_min": 0.0054163840832188725, "clip_ratio/region_mean": 0.011735243955627084, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 298.375, "completions/mean_terminated_length": 298.375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.07337451446801424, "epoch": 7.560000302400012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013598325895145535, "kl": 0.0061271733429748565, "learning_rate": 7.4629799998706825e-06, "loss": 0.0, "num_tokens": 2581716.0, "reward": -0.4641873836517334, "reward_std": 4.7931647300720215, "rewards/rollout_reward_func/mean": -0.4641873836517334, "rewards/rollout_reward_func/std": 4.79316520690918, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.796878814697266, "sampling/sampling_logp_difference/mean": 0.5163519978523254, "step": 189, "step_time": 10.44404124000721 }, { "clip_ratio/high_max": 0.0036833073245361447, "clip_ratio/high_mean": 0.0036833073245361447, "clip_ratio/low_mean": 0.0085978902206989, "clip_ratio/low_min": 0.0085978902206989, "clip_ratio/region_mean": 0.012281197588890791, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 295.8125, "completions/mean_terminated_length": 295.8125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.07131100306287408, "epoch": 7.600000304000012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010495739988982677, "kl": 0.0049033184186555445, "learning_rate": 7.462979999868987e-06, "loss": 0.0, "num_tokens": 2599361.0, "reward": 3.1274375915527344, "reward_std": 9.621321678161621, "rewards/rollout_reward_func/mean": 3.1274375915527344, "rewards/rollout_reward_func/std": 9.621321678161621, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.375057220458984, "sampling/sampling_logp_difference/mean": 0.5055516362190247, "step": 190, "step_time": 10.410316354995302 }, { "clip_ratio/high_max": 0.005347343772882596, "clip_ratio/high_mean": 0.005347343772882596, "clip_ratio/low_mean": 0.004359221682534553, "clip_ratio/low_min": 0.004359221682534553, "clip_ratio/region_mean": 0.009706565469969064, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 295.5625, "completions/mean_terminated_length": 295.5625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.07299318723380566, "epoch": 7.640000305600012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0006524763884954154, "kl": 0.005766287707956508, "learning_rate": 7.46297999986728e-06, "loss": 0.0, "num_tokens": 2617002.0, "reward": -3.062624931335449, "reward_std": 7.542537212371826, "rewards/rollout_reward_func/mean": -3.062624931335449, "rewards/rollout_reward_func/std": 7.542536735534668, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.6250114440918, "sampling/sampling_logp_difference/mean": 0.504819393157959, "step": 191, "step_time": 10.425486904998252 }, { "clip_ratio/high_max": 0.005329333944246173, "clip_ratio/high_mean": 0.005329333944246173, "clip_ratio/low_mean": 0.007807191810570657, "clip_ratio/low_min": 0.007807191810570657, "clip_ratio/region_mean": 0.01313652575481683, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 304.625, "completions/mean_terminated_length": 304.625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.07030050922185183, "epoch": 7.680000307200013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027000978589057922, "kl": 0.006232147745322436, "learning_rate": 7.462979999865561e-06, "loss": 0.0, "num_tokens": 2634816.0, "reward": 2.9385623931884766, "reward_std": 6.0423126220703125, "rewards/rollout_reward_func/mean": 2.9385623931884766, "rewards/rollout_reward_func/std": 6.0423126220703125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.1875, "sampling/sampling_logp_difference/mean": 0.5247738361358643, "step": 192, "step_time": 10.46146557299653 }, { "clip_ratio/high_max": 0.00847348797833547, "clip_ratio/high_mean": 0.00847348797833547, "clip_ratio/low_mean": 0.005110284429974854, "clip_ratio/low_min": 0.005110284429974854, "clip_ratio/region_mean": 0.013583772233687341, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.07460324559360743, "epoch": 7.720000308800012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007952402229420841, "kl": 0.004423161706654355, "learning_rate": 7.4629799998638325e-06, "loss": 0.0, "num_tokens": 2652261.0, "reward": -3.180874824523926, "reward_std": 5.58429479598999, "rewards/rollout_reward_func/mean": -3.180874824523926, "rewards/rollout_reward_func/std": 5.584295749664307, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.43751525878906, "sampling/sampling_logp_difference/mean": 0.5057220458984375, "step": 193, "step_time": 10.390442552008608 }, { "clip_ratio/high_max": 0.007931458065286279, "clip_ratio/high_mean": 0.007931458065286279, "clip_ratio/low_mean": 0.005814814532641321, "clip_ratio/low_min": 0.005814814532641321, "clip_ratio/region_mean": 0.01374627253971994, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 286.5625, "completions/mean_terminated_length": 286.5625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.0761451655998826, "epoch": 7.760000310400013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012406015302985907, "kl": 0.006234578468138352, "learning_rate": 7.462979999862093e-06, "loss": 0.0, "num_tokens": 2669735.0, "reward": -2.2353127002716064, "reward_std": 9.395085334777832, "rewards/rollout_reward_func/mean": -2.2353127002716064, "rewards/rollout_reward_func/std": 9.395085334777832, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.3125, "sampling/sampling_logp_difference/mean": 0.5128077864646912, "step": 194, "step_time": 10.459394909004914 }, { "clip_ratio/high_max": 0.009617814444936812, "clip_ratio/high_mean": 0.009617814444936812, "clip_ratio/low_mean": 0.003862539102556184, "clip_ratio/low_min": 0.003862539102556184, "clip_ratio/region_mean": 0.013480353634804487, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 291.75, "completions/mean_terminated_length": 291.75, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.07446287712082267, "epoch": 7.800000312000012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.000944193743634969, "kl": 0.005717004300095141, "learning_rate": 7.462979999860342e-06, "loss": 0.0, "num_tokens": 2687313.0, "reward": -3.3571250438690186, "reward_std": 6.334319114685059, "rewards/rollout_reward_func/mean": -3.3571250438690186, "rewards/rollout_reward_func/std": 6.334319114685059, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.890625, "sampling/sampling_logp_difference/mean": 0.5140663385391235, "step": 195, "step_time": 10.36688834099914 }, { "clip_ratio/high_max": 0.009782533365068957, "clip_ratio/high_mean": 0.009782533365068957, "clip_ratio/low_mean": 0.005132559395860881, "clip_ratio/low_min": 0.005132559395860881, "clip_ratio/region_mean": 0.014915092673618346, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 294.5625, "completions/mean_terminated_length": 294.5625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.07086490234360099, "epoch": 7.840000313600013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0006516508874483407, "kl": 0.004234404390444979, "learning_rate": 7.462979999858578e-06, "loss": 0.0, "num_tokens": 2704930.0, "reward": -4.743750095367432, "reward_std": 7.26796817779541, "rewards/rollout_reward_func/mean": -4.743750095367432, "rewards/rollout_reward_func/std": 7.267968654632568, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.1875, "sampling/sampling_logp_difference/mean": 0.5014854669570923, "step": 196, "step_time": 10.425910582009237 }, { "clip_ratio/high_max": 0.007215786725282669, "clip_ratio/high_mean": 0.007215786725282669, "clip_ratio/low_mean": 0.00761663872981444, "clip_ratio/low_min": 0.00761663872981444, "clip_ratio/region_mean": 0.014832425455097109, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 286.0625, "completions/mean_terminated_length": 286.0625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.06913609337061644, "epoch": 7.880000315200012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0006139131146483123, "kl": 0.005461916676722467, "learning_rate": 7.4629799998568055e-06, "loss": 0.0, "num_tokens": 2722392.0, "reward": 1.5505625009536743, "reward_std": 9.03647518157959, "rewards/rollout_reward_func/mean": 1.5505625009536743, "rewards/rollout_reward_func/std": 9.036476135253906, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.0, "sampling/sampling_logp_difference/mean": 0.5393673777580261, "step": 197, "step_time": 10.394044206994295 }, { "clip_ratio/high_max": 0.007037784205749631, "clip_ratio/high_mean": 0.007037784205749631, "clip_ratio/low_mean": 0.007735451392363757, "clip_ratio/low_min": 0.007735451392363757, "clip_ratio/region_mean": 0.014773235539905727, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.06769020203500986, "epoch": 7.920000316800013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026472776662558317, "kl": 0.009624648344470188, "learning_rate": 7.462979999855022e-06, "loss": 0.0, "num_tokens": 2739967.0, "reward": 2.2893126010894775, "reward_std": 9.238142967224121, "rewards/rollout_reward_func/mean": 2.2893126010894775, "rewards/rollout_reward_func/std": 9.238142967224121, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.15625, "sampling/sampling_logp_difference/mean": 0.5161686539649963, "step": 198, "step_time": 10.430668625005637 }, { "clip_ratio/high_max": 0.00643123232293874, "clip_ratio/high_mean": 0.00643123232293874, "clip_ratio/low_mean": 0.009253874683054164, "clip_ratio/low_min": 0.009253874683054164, "clip_ratio/region_mean": 0.015685106976889074, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.07331555848941207, "epoch": 7.960000318400014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007661830168217421, "kl": 0.005430422141216695, "learning_rate": 7.462979999853226e-06, "loss": 0.0, "num_tokens": 2757574.0, "reward": 1.5758750438690186, "reward_std": 7.998481750488281, "rewards/rollout_reward_func/mean": 1.5758750438690186, "rewards/rollout_reward_func/std": 7.998481750488281, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.25, "sampling/sampling_logp_difference/mean": 0.48864784836769104, "step": 199, "step_time": 10.368442829989363 }, { "clip_ratio/high_max": 0.007758725201711059, "clip_ratio/high_mean": 0.007758725201711059, "clip_ratio/low_mean": 0.005943318392382935, "clip_ratio/low_min": 0.005943318392382935, "clip_ratio/region_mean": 0.013702043768716976, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 294.0625, "completions/mean_terminated_length": 294.0625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.06773880636319518, "epoch": 8.000000320000013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007176241488195956, "kl": 0.007380193506833166, "learning_rate": 7.462979999851419e-06, "loss": 0.0, "num_tokens": 2775165.0, "reward": -3.3624372482299805, "reward_std": 4.229238033294678, "rewards/rollout_reward_func/mean": -3.3624372482299805, "rewards/rollout_reward_func/std": 4.229238510131836, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.84375, "sampling/sampling_logp_difference/mean": 0.5226989984512329, "step": 200, "step_time": 10.373237561005226 }, { "clip_ratio/high_max": 0.00716740608913824, "clip_ratio/high_mean": 0.00716740608913824, "clip_ratio/low_mean": 0.007129900739528239, "clip_ratio/low_min": 0.007129900739528239, "clip_ratio/region_mean": 0.014297306770458817, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 296.4375, "completions/mean_terminated_length": 296.4375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.0676097390241921, "epoch": 8.040000321600013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.00103105534799397, "kl": 0.005847204738529399, "learning_rate": 7.4629799998496015e-06, "loss": 0.0, "num_tokens": 2792834.0, "reward": -3.079812526702881, "reward_std": 5.178314685821533, "rewards/rollout_reward_func/mean": -3.079812526702881, "rewards/rollout_reward_func/std": 5.178315162658691, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.1875, "sampling/sampling_logp_difference/mean": 0.5041525959968567, "step": 201, "step_time": 10.456045853992691 }, { "clip_ratio/high_max": 0.006707881693728268, "clip_ratio/high_mean": 0.006707881693728268, "clip_ratio/low_mean": 0.004892625147476792, "clip_ratio/low_min": 0.004892625147476792, "clip_ratio/region_mean": 0.0116005067829974, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 297.1875, "completions/mean_terminated_length": 297.1875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.06414524186402559, "epoch": 8.080000323200013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024494952522218227, "kl": 0.007977387111168355, "learning_rate": 7.462979999847774e-06, "loss": 0.0, "num_tokens": 2810516.0, "reward": -4.479562282562256, "reward_std": 8.388687133789062, "rewards/rollout_reward_func/mean": -4.479562282562256, "rewards/rollout_reward_func/std": 8.388687133789062, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.8125, "sampling/sampling_logp_difference/mean": 0.49383965134620667, "step": 202, "step_time": 10.521077334997244 }, { "clip_ratio/high_max": 0.0048702006170060486, "clip_ratio/high_mean": 0.0048702006170060486, "clip_ratio/low_mean": 0.003835549025097862, "clip_ratio/low_min": 0.003835549025097862, "clip_ratio/region_mean": 0.008705749598448165, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.07122470624744892, "epoch": 8.120000324800013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0006830418133176863, "kl": 0.007118333887774497, "learning_rate": 7.462979999845935e-06, "loss": 0.0, "num_tokens": 2828098.0, "reward": -2.215437412261963, "reward_std": 5.841372966766357, "rewards/rollout_reward_func/mean": -2.215437412261963, "rewards/rollout_reward_func/std": 5.841372966766357, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.33984375, "sampling/sampling_logp_difference/mean": 0.5098539590835571, "step": 203, "step_time": 10.42646260800393 }, { "clip_ratio/high_max": 0.007122764131054282, "clip_ratio/high_mean": 0.007122764131054282, "clip_ratio/low_mean": 0.0038379052857635543, "clip_ratio/low_min": 0.0038379052857635543, "clip_ratio/region_mean": 0.01096066937316209, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 279.0625, "completions/mean_terminated_length": 279.0625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.068637958727777, "epoch": 8.160000326400013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013256366364657879, "kl": 0.005409070960013196, "learning_rate": 7.462979999844084e-06, "loss": 0.0, "num_tokens": 2845437.0, "reward": -1.9845623970031738, "reward_std": 4.301019191741943, "rewards/rollout_reward_func/mean": -1.9845623970031738, "rewards/rollout_reward_func/std": 4.301019668579102, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.15625, "sampling/sampling_logp_difference/mean": 0.5228663086891174, "step": 204, "step_time": 10.343530118007038 }, { "clip_ratio/high_max": 0.004984182014595717, "clip_ratio/high_mean": 0.004984182014595717, "clip_ratio/low_mean": 0.005790864699520171, "clip_ratio/low_min": 0.005790864699520171, "clip_ratio/region_mean": 0.010775046714115888, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 302.125, "completions/mean_terminated_length": 302.125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.07109927712008357, "epoch": 8.200000328000013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012679125647991896, "kl": 0.006134924551588483, "learning_rate": 7.462979999842222e-06, "loss": 0.0, "num_tokens": 2863189.0, "reward": -0.12943744659423828, "reward_std": 4.749524116516113, "rewards/rollout_reward_func/mean": -0.12943744659423828, "rewards/rollout_reward_func/std": 4.749523639678955, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.703125, "sampling/sampling_logp_difference/mean": 0.5272496342658997, "step": 205, "step_time": 10.485789341000782 }, { "clip_ratio/high_max": 0.00403137925604824, "clip_ratio/high_mean": 0.00403137925604824, "clip_ratio/low_mean": 0.00872997718397528, "clip_ratio/low_min": 0.00872997718397528, "clip_ratio/region_mean": 0.012761356425471604, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.0743389492854476, "epoch": 8.240000329600012e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008070362382568419, "kl": 0.006990123802097514, "learning_rate": 7.4629799998403485e-06, "loss": 0.0, "num_tokens": 2880813.0, "reward": 5.090437412261963, "reward_std": 26.208789825439453, "rewards/rollout_reward_func/mean": 5.090437412261963, "rewards/rollout_reward_func/std": 26.20879364013672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.5, "sampling/sampling_logp_difference/mean": 0.5036377906799316, "step": 206, "step_time": 10.409622582017619 }, { "clip_ratio/high_max": 0.0072547850431874394, "clip_ratio/high_mean": 0.0072547850431874394, "clip_ratio/low_mean": 0.007388435682514682, "clip_ratio/low_min": 0.007388435682514682, "clip_ratio/region_mean": 0.014643220696598291, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 285.6875, "completions/mean_terminated_length": 285.6875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.06925734551623464, "epoch": 8.280000331200013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.000623320578597486, "kl": 0.006224569398909807, "learning_rate": 7.462979999838465e-06, "loss": 0.0, "num_tokens": 2898262.0, "reward": -0.690000057220459, "reward_std": 7.10194206237793, "rewards/rollout_reward_func/mean": -0.690000057220459, "rewards/rollout_reward_func/std": 7.101943016052246, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.28128433227539, "sampling/sampling_logp_difference/mean": 0.5192135572433472, "step": 207, "step_time": 10.343538086999615 }, { "clip_ratio/high_max": 0.007540311780758202, "clip_ratio/high_mean": 0.007540311780758202, "clip_ratio/low_mean": 0.0067114136181771755, "clip_ratio/low_min": 0.0067114136181771755, "clip_ratio/region_mean": 0.0142517255153507, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 298.8125, "completions/mean_terminated_length": 298.8125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.07657580729573965, "epoch": 8.320000332800014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017324785003438592, "kl": 0.005487348447786644, "learning_rate": 7.46297999983657e-06, "loss": 0.0, "num_tokens": 2915954.0, "reward": -3.106062412261963, "reward_std": 5.189664363861084, "rewards/rollout_reward_func/mean": -3.106062412261963, "rewards/rollout_reward_func/std": 5.189664840698242, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.25, "sampling/sampling_logp_difference/mean": 0.5135565996170044, "step": 208, "step_time": 10.470864311995683 }, { "clip_ratio/high_max": 0.0052483935141935945, "clip_ratio/high_mean": 0.0052483935141935945, "clip_ratio/low_mean": 0.005675289809005335, "clip_ratio/low_min": 0.005675289809005335, "clip_ratio/region_mean": 0.01092368335230276, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 291.75, "completions/mean_terminated_length": 291.75, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.07329797092825174, "epoch": 8.360000334400013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011501156259328127, "kl": 0.005376071610953659, "learning_rate": 7.462979999834665e-06, "loss": 0.0, "num_tokens": 2933512.0, "reward": 3.5945000648498535, "reward_std": 26.457881927490234, "rewards/rollout_reward_func/mean": 3.5945000648498535, "rewards/rollout_reward_func/std": 26.457883834838867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.6875, "sampling/sampling_logp_difference/mean": 0.48745647072792053, "step": 209, "step_time": 10.40872379999928 }, { "clip_ratio/high_max": 0.004135372815653682, "clip_ratio/high_mean": 0.004135372815653682, "clip_ratio/low_mean": 0.007877453230321407, "clip_ratio/low_min": 0.007877453230321407, "clip_ratio/region_mean": 0.012012826045975089, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 290.6875, "completions/mean_terminated_length": 290.6875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.07768004480749369, "epoch": 8.400000336000014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008427107823081315, "kl": 0.005113016290124506, "learning_rate": 7.462979999832747e-06, "loss": 0.0, "num_tokens": 2951066.0, "reward": 0.40018749237060547, "reward_std": 4.2269816398620605, "rewards/rollout_reward_func/mean": 0.40018749237060547, "rewards/rollout_reward_func/std": 4.226982116699219, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.0, "sampling/sampling_logp_difference/mean": 0.5105453729629517, "step": 210, "step_time": 10.338383365997288 }, { "clip_ratio/high_max": 0.00821595371235162, "clip_ratio/high_mean": 0.00821595371235162, "clip_ratio/low_mean": 0.006136946787592024, "clip_ratio/low_min": 0.006136946787592024, "clip_ratio/region_mean": 0.014352900441735983, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 304.5, "completions/mean_terminated_length": 304.5, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.07289551943540573, "epoch": 8.440000337600013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011316367890685797, "kl": 0.005552825052291155, "learning_rate": 7.462979999830819e-06, "loss": 0.0, "num_tokens": 2968890.0, "reward": -2.5518124103546143, "reward_std": 10.742227554321289, "rewards/rollout_reward_func/mean": -2.5518124103546143, "rewards/rollout_reward_func/std": 10.742227554321289, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.3125, "sampling/sampling_logp_difference/mean": 0.49785053730010986, "step": 211, "step_time": 10.350022268008615 }, { "clip_ratio/high_max": 0.007911017222795635, "clip_ratio/high_mean": 0.007911017222795635, "clip_ratio/low_mean": 0.004722237150417641, "clip_ratio/low_min": 0.004722237150417641, "clip_ratio/region_mean": 0.012633254285901785, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 294.3125, "completions/mean_terminated_length": 294.3125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.07099650474265218, "epoch": 8.480000339200014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.04617423936724663, "kl": 0.034373199014225975, "learning_rate": 7.46297999982888e-06, "loss": 0.0001, "num_tokens": 2986507.0, "reward": -1.2528750896453857, "reward_std": 2.3789212703704834, "rewards/rollout_reward_func/mean": -1.2528750896453857, "rewards/rollout_reward_func/std": 2.3789212703704834, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.125, "sampling/sampling_logp_difference/mean": 0.5280681252479553, "step": 212, "step_time": 10.39342241500708 }, { "clip_ratio/high_max": 0.007163203554227948, "clip_ratio/high_mean": 0.007163203554227948, "clip_ratio/low_mean": 0.005455520236864686, "clip_ratio/low_min": 0.005455520236864686, "clip_ratio/region_mean": 0.012618723849300295, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 286.8125, "completions/mean_terminated_length": 286.8125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.07894612662494183, "epoch": 8.520000340800013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007902010111138225, "kl": 0.005645010911393911, "learning_rate": 7.46297999982693e-06, "loss": 0.0, "num_tokens": 3003986.0, "reward": 0.16543757915496826, "reward_std": 8.710289001464844, "rewards/rollout_reward_func/mean": 0.16543757915496826, "rewards/rollout_reward_func/std": 8.71028995513916, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.65625, "sampling/sampling_logp_difference/mean": 0.5372638702392578, "step": 213, "step_time": 10.426365067003644 }, { "clip_ratio/high_max": 0.006808929669205099, "clip_ratio/high_mean": 0.006808929669205099, "clip_ratio/low_mean": 0.006120394304161891, "clip_ratio/low_min": 0.006120394304161891, "clip_ratio/region_mean": 0.012929323827847838, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 275.8125, "completions/mean_terminated_length": 275.8125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.0752148199826479, "epoch": 8.560000342400014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.00105961412191391, "kl": 0.005990634876070544, "learning_rate": 7.46297999982497e-06, "loss": 0.0, "num_tokens": 3021270.0, "reward": 6.443249702453613, "reward_std": 22.919885635375977, "rewards/rollout_reward_func/mean": 6.443249702453613, "rewards/rollout_reward_func/std": 22.919885635375977, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.984375, "sampling/sampling_logp_difference/mean": 0.5495111346244812, "step": 214, "step_time": 10.289786412999092 }, { "clip_ratio/high_max": 0.0032314254785887897, "clip_ratio/high_mean": 0.0032314254785887897, "clip_ratio/low_mean": 0.00678424775833264, "clip_ratio/low_min": 0.00678424775833264, "clip_ratio/region_mean": 0.0100156732078176, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 286.875, "completions/mean_terminated_length": 286.875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.07191575784236193, "epoch": 8.600000344000013e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.001296165632084012, "kl": 0.008857668552082032, "learning_rate": 7.462979999822997e-06, "loss": 0.0, "num_tokens": 3038753.0, "reward": 4.606249809265137, "reward_std": 24.46164894104004, "rewards/rollout_reward_func/mean": 4.606249809265137, "rewards/rollout_reward_func/std": 24.461652755737305, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.90625, "sampling/sampling_logp_difference/mean": 0.531778872013092, "step": 215, "step_time": 10.41431663397816 }, { "clip_ratio/high_max": 0.004112839815206826, "clip_ratio/high_mean": 0.004112839815206826, "clip_ratio/low_mean": 0.0067651880090124905, "clip_ratio/low_min": 0.0067651880090124905, "clip_ratio/region_mean": 0.010878027824219316, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 288.0625, "completions/mean_terminated_length": 288.0625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.07259572204202414, "epoch": 8.640000345600014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010067387484014034, "kl": 0.008078172482782975, "learning_rate": 7.462979999821014e-06, "loss": 0.0, "num_tokens": 3056251.0, "reward": 4.470749378204346, "reward_std": 25.280162811279297, "rewards/rollout_reward_func/mean": 4.470749378204346, "rewards/rollout_reward_func/std": 25.28015899658203, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.796875, "sampling/sampling_logp_difference/mean": 0.528924822807312, "step": 216, "step_time": 10.365210538999236 }, { "clip_ratio/high_max": 0.007389905018499121, "clip_ratio/high_mean": 0.007389905018499121, "clip_ratio/low_mean": 0.004879763990174979, "clip_ratio/low_min": 0.004879763990174979, "clip_ratio/region_mean": 0.012269669212400913, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.07718994887545705, "epoch": 8.680000347200015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014796305913478136, "kl": 0.009286250889999792, "learning_rate": 7.462979999819019e-06, "loss": 0.0, "num_tokens": 3073809.0, "reward": -0.07100000977516174, "reward_std": 1.9657223224639893, "rewards/rollout_reward_func/mean": -0.07100000977516174, "rewards/rollout_reward_func/std": 1.9657223224639893, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.125164031982422, "sampling/sampling_logp_difference/mean": 0.4872804880142212, "step": 217, "step_time": 10.490648301994952 }, { "clip_ratio/high_max": 0.006382640975061804, "clip_ratio/high_mean": 0.006382640975061804, "clip_ratio/low_mean": 0.006465274782385677, "clip_ratio/low_min": 0.006465274782385677, "clip_ratio/region_mean": 0.01284791564103216, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 292.4375, "completions/mean_terminated_length": 292.4375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.07658532913774252, "epoch": 8.720000348800014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009322648402303457, "kl": 0.0066098653769586235, "learning_rate": 7.4629799998170144e-06, "loss": 0.0, "num_tokens": 3091403.0, "reward": 5.070812225341797, "reward_std": 19.37571907043457, "rewards/rollout_reward_func/mean": 5.070812225341797, "rewards/rollout_reward_func/std": 19.375720977783203, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.8125, "sampling/sampling_logp_difference/mean": 0.5114758014678955, "step": 218, "step_time": 10.409408122999594 }, { "clip_ratio/high_max": 0.006274793500779197, "clip_ratio/high_mean": 0.006274793500779197, "clip_ratio/low_mean": 0.0035967174189863726, "clip_ratio/low_min": 0.0035967174189863726, "clip_ratio/region_mean": 0.009871510788798332, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 291.75, "completions/mean_terminated_length": 291.75, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.07502627186477184, "epoch": 8.760000350400014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009974229615181684, "kl": 0.008301462657982484, "learning_rate": 7.462979999814998e-06, "loss": 0.0, "num_tokens": 3108979.0, "reward": -0.5715000629425049, "reward_std": 7.991994857788086, "rewards/rollout_reward_func/mean": -0.5715000629425049, "rewards/rollout_reward_func/std": 7.991994857788086, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.21875, "sampling/sampling_logp_difference/mean": 0.4979334771633148, "step": 219, "step_time": 10.441937400006282 }, { "clip_ratio/high_max": 0.0070088920765556395, "clip_ratio/high_mean": 0.0070088920765556395, "clip_ratio/low_mean": 0.005253318333416246, "clip_ratio/low_min": 0.005253318333416246, "clip_ratio/region_mean": 0.012262210308108479, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 287.9375, "completions/mean_terminated_length": 287.9375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.07160999719053507, "epoch": 8.800000352000014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008315242594107985, "kl": 0.008744841732550412, "learning_rate": 7.462979999812971e-06, "loss": 0.0, "num_tokens": 3126482.0, "reward": -0.9008749723434448, "reward_std": 6.923779010772705, "rewards/rollout_reward_func/mean": -0.9008749723434448, "rewards/rollout_reward_func/std": 6.923779487609863, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.250064849853516, "sampling/sampling_logp_difference/mean": 0.5067073702812195, "step": 220, "step_time": 10.422230709009455 }, { "clip_ratio/high_max": 0.006966194428969175, "clip_ratio/high_mean": 0.006966194428969175, "clip_ratio/low_mean": 0.0036773718748008832, "clip_ratio/low_min": 0.0036773718748008832, "clip_ratio/region_mean": 0.010643566434737295, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 306.1875, "completions/mean_terminated_length": 306.1875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.07323083095252514, "epoch": 8.840000353600014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0005752131110057235, "kl": 0.0060841158556286246, "learning_rate": 7.462979999810932e-06, "loss": 0.0, "num_tokens": 3144329.0, "reward": -2.7224373817443848, "reward_std": 4.259339332580566, "rewards/rollout_reward_func/mean": -2.7224373817443848, "rewards/rollout_reward_func/std": 4.259339332580566, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.15625, "sampling/sampling_logp_difference/mean": 0.4907236695289612, "step": 221, "step_time": 10.39906555799098 }, { "clip_ratio/high_max": 0.006185467485920526, "clip_ratio/high_mean": 0.006185467485920526, "clip_ratio/low_mean": 0.007484167406801134, "clip_ratio/low_min": 0.007484167406801134, "clip_ratio/region_mean": 0.013669634761754423, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 285.1875, "completions/mean_terminated_length": 285.1875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.0745758842676878, "epoch": 8.880000355200014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007806437206454575, "kl": 0.009217363112838939, "learning_rate": 7.462979999808883e-06, "loss": 0.0, "num_tokens": 3161783.0, "reward": 6.308187484741211, "reward_std": 22.240018844604492, "rewards/rollout_reward_func/mean": 6.308187484741211, "rewards/rollout_reward_func/std": 22.240018844604492, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.9375, "sampling/sampling_logp_difference/mean": 0.49560806155204773, "step": 222, "step_time": 10.510315344996343 }, { "clip_ratio/high_max": 0.008099342521745712, "clip_ratio/high_mean": 0.008099342521745712, "clip_ratio/low_mean": 0.005552220362005755, "clip_ratio/low_min": 0.005552220362005755, "clip_ratio/region_mean": 0.013651562621816993, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 293.625, "completions/mean_terminated_length": 293.625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.07354083377867937, "epoch": 8.920000356800014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034127256367355585, "kl": 0.010761091369204223, "learning_rate": 7.462979999806822e-06, "loss": 0.0, "num_tokens": 3179390.0, "reward": -0.35224997997283936, "reward_std": 15.819132804870605, "rewards/rollout_reward_func/mean": -0.35224997997283936, "rewards/rollout_reward_func/std": 15.819132804870605, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.375, "sampling/sampling_logp_difference/mean": 0.48868516087532043, "step": 223, "step_time": 10.460384486999828 }, { "clip_ratio/high_max": 0.0032605971209704876, "clip_ratio/high_mean": 0.0032605971209704876, "clip_ratio/low_mean": 0.006037081562681124, "clip_ratio/low_min": 0.006037081562681124, "clip_ratio/region_mean": 0.00929767859634012, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.07803612668067217, "epoch": 8.960000358400015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0005930400802753866, "kl": 0.0051369365537539124, "learning_rate": 7.46297999980475e-06, "loss": 0.0, "num_tokens": 3196897.0, "reward": 6.539249897003174, "reward_std": 25.11175537109375, "rewards/rollout_reward_func/mean": 6.539249897003174, "rewards/rollout_reward_func/std": 25.111753463745117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 26.75002670288086, "sampling/sampling_logp_difference/mean": 0.4839804768562317, "step": 224, "step_time": 10.482437264981854 }, { "clip_ratio/high_max": 0.002772088613710366, "clip_ratio/high_mean": 0.002772088613710366, "clip_ratio/low_mean": 0.007232827658299357, "clip_ratio/low_min": 0.007232827658299357, "clip_ratio/region_mean": 0.010004916344769299, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 299.625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.06949639366939664, "epoch": 9.000000360000014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017324341461062431, "kl": 0.012604300951352343, "learning_rate": 7.462979999802668e-06, "loss": 0.0, "num_tokens": 3214611.0, "reward": 6.1917500495910645, "reward_std": 26.303720474243164, "rewards/rollout_reward_func/mean": 6.1917500495910645, "rewards/rollout_reward_func/std": 26.303720474243164, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.062522888183594, "sampling/sampling_logp_difference/mean": 0.5087279081344604, "step": 225, "step_time": 10.426214112994785 }, { "clip_ratio/high_max": 0.005798804195364937, "clip_ratio/high_mean": 0.005798804195364937, "clip_ratio/low_mean": 0.005987404801999219, "clip_ratio/low_min": 0.005987404801999219, "clip_ratio/region_mean": 0.011786208779085428, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07354439236223698, "epoch": 9.040000361600015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0019116246839985251, "kl": 0.008319076878251508, "learning_rate": 7.462979999800574e-06, "loss": 0.0, "num_tokens": 3232022.0, "reward": -2.6448750495910645, "reward_std": 9.265175819396973, "rewards/rollout_reward_func/mean": -2.6448750495910645, "rewards/rollout_reward_func/std": 9.265175819396973, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.0625, "sampling/sampling_logp_difference/mean": 0.5094653964042664, "step": 226, "step_time": 10.404956913997012 }, { "clip_ratio/high_max": 0.005037394352257252, "clip_ratio/high_mean": 0.005037394352257252, "clip_ratio/low_mean": 0.00541810889262706, "clip_ratio/low_min": 0.00541810889262706, "clip_ratio/region_mean": 0.010455503361299634, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 300.25, "completions/mean_terminated_length": 300.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.0709527526050806, "epoch": 9.080000363200014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030708452686667442, "kl": 0.011681635631248355, "learning_rate": 7.462979999798469e-06, "loss": 0.0, "num_tokens": 3249750.0, "reward": 9.407562255859375, "reward_std": 31.90032386779785, "rewards/rollout_reward_func/mean": 9.407562255859375, "rewards/rollout_reward_func/std": 31.90032386779785, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.421875, "sampling/sampling_logp_difference/mean": 0.5103008151054382, "step": 227, "step_time": 10.49286517699511 }, { "clip_ratio/high_max": 0.006486930418759584, "clip_ratio/high_mean": 0.006486930418759584, "clip_ratio/low_mean": 0.005872974172234535, "clip_ratio/low_min": 0.005872974172234535, "clip_ratio/region_mean": 0.01235990459099412, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 295.6875, "completions/mean_terminated_length": 295.6875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.07401463389396667, "epoch": 9.120000364800015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009173924336209893, "kl": 0.009242938423994929, "learning_rate": 7.462979999796354e-06, "loss": 0.0, "num_tokens": 3267394.0, "reward": -2.740062713623047, "reward_std": 8.185542106628418, "rewards/rollout_reward_func/mean": -2.740062713623047, "rewards/rollout_reward_func/std": 8.185543060302734, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.125, "sampling/sampling_logp_difference/mean": 0.5013666749000549, "step": 228, "step_time": 10.459271728992462 }, { "clip_ratio/high_max": 0.0045171317178756, "clip_ratio/high_mean": 0.0045171317178756, "clip_ratio/low_mean": 0.008404601889196783, "clip_ratio/low_min": 0.008404601889196783, "clip_ratio/region_mean": 0.012921733723487705, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.06940495269373059, "epoch": 9.160000366400014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008395009790547192, "kl": 0.007896699447883293, "learning_rate": 7.462979999794227e-06, "loss": 0.0, "num_tokens": 3284837.0, "reward": -0.6620000004768372, "reward_std": 7.491186618804932, "rewards/rollout_reward_func/mean": -0.6620000004768372, "rewards/rollout_reward_func/std": 7.491187572479248, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.359375, "sampling/sampling_logp_difference/mean": 0.5254080295562744, "step": 229, "step_time": 10.449522949005768 }, { "clip_ratio/high_max": 0.0069802088546566665, "clip_ratio/high_mean": 0.0069802088546566665, "clip_ratio/low_mean": 0.004850742145208642, "clip_ratio/low_min": 0.004850742145208642, "clip_ratio/region_mean": 0.0118309510871768, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 285.1875, "completions/mean_terminated_length": 285.1875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.07008095132187009, "epoch": 9.200000368000015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008125026943162084, "kl": 0.006478730298113078, "learning_rate": 7.462979999792089e-06, "loss": 0.0, "num_tokens": 3302271.0, "reward": -0.6057499051094055, "reward_std": 9.229299545288086, "rewards/rollout_reward_func/mean": -0.6057499051094055, "rewards/rollout_reward_func/std": 9.229299545288086, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.65625, "sampling/sampling_logp_difference/mean": 0.5465345978736877, "step": 230, "step_time": 10.466248587996233 }, { "clip_ratio/high_max": 0.002579241569037549, "clip_ratio/high_mean": 0.002579241569037549, "clip_ratio/low_mean": 0.00872062350390479, "clip_ratio/low_min": 0.00872062350390479, "clip_ratio/region_mean": 0.011299864971078932, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.0716686206869781, "epoch": 9.240000369600014e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009823498548939824, "kl": 0.007057618582621217, "learning_rate": 7.46297999978994e-06, "loss": 0.0, "num_tokens": 3319772.0, "reward": 5.367499828338623, "reward_std": 22.293678283691406, "rewards/rollout_reward_func/mean": 5.367499828338623, "rewards/rollout_reward_func/std": 22.293678283691406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.578125, "sampling/sampling_logp_difference/mean": 0.5200445652008057, "step": 231, "step_time": 10.530433485997492 }, { "clip_ratio/high_max": 0.005123450013343245, "clip_ratio/high_mean": 0.005123450013343245, "clip_ratio/low_mean": 0.004729689579107799, "clip_ratio/low_min": 0.004729689579107799, "clip_ratio/region_mean": 0.009853139636106789, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 292.4375, "completions/mean_terminated_length": 292.4375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.07078778650611639, "epoch": 9.280000371200015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0006170250126160681, "kl": 0.005615973903331906, "learning_rate": 7.46297999978778e-06, "loss": 0.0, "num_tokens": 3337353.0, "reward": 2.8162500858306885, "reward_std": 24.549152374267578, "rewards/rollout_reward_func/mean": 2.8162500858306885, "rewards/rollout_reward_func/std": 24.54915428161621, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.5625, "sampling/sampling_logp_difference/mean": 0.5024914145469666, "step": 232, "step_time": 10.511879109013535 }, { "clip_ratio/high_max": 0.007292784983292222, "clip_ratio/high_mean": 0.007292784983292222, "clip_ratio/low_mean": 0.006174966809339821, "clip_ratio/low_min": 0.006174966809339821, "clip_ratio/region_mean": 0.013467751909047365, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 287.0625, "completions/mean_terminated_length": 287.0625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.0764157073572278, "epoch": 9.320000372800016e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012183890212327242, "kl": 0.009588233137037605, "learning_rate": 7.462979999785609e-06, "loss": 0.0, "num_tokens": 3354825.0, "reward": 1.3982502222061157, "reward_std": 8.671204566955566, "rewards/rollout_reward_func/mean": 1.3982502222061157, "rewards/rollout_reward_func/std": 8.671204566955566, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.75, "sampling/sampling_logp_difference/mean": 0.5325564742088318, "step": 233, "step_time": 10.439244205997966 }, { "clip_ratio/high_max": 0.006610548734897748, "clip_ratio/high_mean": 0.006610548734897748, "clip_ratio/low_mean": 0.006613574631046504, "clip_ratio/low_min": 0.006613574631046504, "clip_ratio/region_mean": 0.01322412327863276, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 283.4375, "completions/mean_terminated_length": 283.4375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.07522009499371052, "epoch": 9.360000374400015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007153196493163705, "kl": 0.0063078636303544044, "learning_rate": 7.462979999783426e-06, "loss": 0.0, "num_tokens": 3372239.0, "reward": -0.21200013160705566, "reward_std": 10.733263969421387, "rewards/rollout_reward_func/mean": -0.21200013160705566, "rewards/rollout_reward_func/std": 10.733263969421387, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.21875, "sampling/sampling_logp_difference/mean": 0.5257741808891296, "step": 234, "step_time": 10.356595102006395 }, { "clip_ratio/high_max": 0.007157859276048839, "clip_ratio/high_mean": 0.007157859276048839, "clip_ratio/low_mean": 0.003437432795180939, "clip_ratio/low_min": 0.003437432795180939, "clip_ratio/region_mean": 0.010595292173093185, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 293.75, "completions/mean_terminated_length": 293.75, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.07259562332183123, "epoch": 9.400000376000016e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008861714159138501, "kl": 0.007516657700762153, "learning_rate": 7.462979999781233e-06, "loss": 0.0, "num_tokens": 3389861.0, "reward": 3.9145002365112305, "reward_std": 24.87602996826172, "rewards/rollout_reward_func/mean": 3.9145002365112305, "rewards/rollout_reward_func/std": 24.87602996826172, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.6875, "sampling/sampling_logp_difference/mean": 0.5075604319572449, "step": 235, "step_time": 10.50964647600631 }, { "clip_ratio/high_max": 0.009263997460948303, "clip_ratio/high_mean": 0.009263997460948303, "clip_ratio/low_mean": 0.004536461376119405, "clip_ratio/low_min": 0.004536461376119405, "clip_ratio/region_mean": 0.013800458575133234, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 286.0625, "completions/mean_terminated_length": 286.0625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.07244259584695101, "epoch": 9.440000377600015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008124469895847142, "kl": 0.007635855436092243, "learning_rate": 7.46297999977903e-06, "loss": 0.0, "num_tokens": 3407324.0, "reward": -1.1889375448226929, "reward_std": 9.304420471191406, "rewards/rollout_reward_func/mean": -1.1889375448226929, "rewards/rollout_reward_func/std": 9.304420471191406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.625, "sampling/sampling_logp_difference/mean": 0.5102166533470154, "step": 236, "step_time": 10.38328426201042 }, { "clip_ratio/high_max": 0.006655705903540365, "clip_ratio/high_mean": 0.006655705903540365, "clip_ratio/low_mean": 0.004969383939169347, "clip_ratio/low_min": 0.004969383939169347, "clip_ratio/region_mean": 0.011625089857261628, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 280.5625, "completions/mean_terminated_length": 280.5625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.0703823328949511, "epoch": 9.480000379200015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012428844347596169, "kl": 0.007022345322184265, "learning_rate": 7.462979999776814e-06, "loss": 0.0, "num_tokens": 3424673.0, "reward": -1.6115624904632568, "reward_std": 2.095017671585083, "rewards/rollout_reward_func/mean": -1.6115624904632568, "rewards/rollout_reward_func/std": 2.095017671585083, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.59375, "sampling/sampling_logp_difference/mean": 0.553109884262085, "step": 237, "step_time": 10.385066492999613 }, { "clip_ratio/high_max": 0.0062781854649074376, "clip_ratio/high_mean": 0.0062781854649074376, "clip_ratio/low_mean": 0.00677502085454762, "clip_ratio/low_min": 0.00677502085454762, "clip_ratio/region_mean": 0.013053206377662718, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.07437111111357808, "epoch": 9.520000380800015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009913143003359437, "kl": 0.006143612088635564, "learning_rate": 7.462979999774587e-06, "loss": 0.0, "num_tokens": 3442197.0, "reward": 3.161062240600586, "reward_std": 25.236520767211914, "rewards/rollout_reward_func/mean": 3.161062240600586, "rewards/rollout_reward_func/std": 25.236520767211914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.09375, "sampling/sampling_logp_difference/mean": 0.5063914060592651, "step": 238, "step_time": 10.418692467996152 }, { "clip_ratio/high_max": 0.007481343811377883, "clip_ratio/high_mean": 0.007481343811377883, "clip_ratio/low_mean": 0.008446184219792485, "clip_ratio/low_min": 0.008446184219792485, "clip_ratio/region_mean": 0.01592752814758569, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 282.75, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.07956357765942812, "epoch": 9.560000382400015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026051048189401627, "kl": 0.008892162120901048, "learning_rate": 7.46297999977235e-06, "loss": 0.0, "num_tokens": 3459595.0, "reward": -0.4233750104904175, "reward_std": 2.6685330867767334, "rewards/rollout_reward_func/mean": -0.4233750104904175, "rewards/rollout_reward_func/std": 2.6685333251953125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.25, "sampling/sampling_logp_difference/mean": 0.5284103155136108, "step": 239, "step_time": 10.30005260600592 }, { "clip_ratio/high_max": 0.005940628732787445, "clip_ratio/high_mean": 0.005940628732787445, "clip_ratio/low_mean": 0.005964512762147933, "clip_ratio/low_min": 0.005964512762147933, "clip_ratio/region_mean": 0.011905141407623887, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.07516445871442556, "epoch": 9.600000384000016e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008605712209828198, "kl": 0.008138176024658605, "learning_rate": 7.462979999770102e-06, "loss": 0.0, "num_tokens": 3477000.0, "reward": -0.5318750143051147, "reward_std": 2.7277369499206543, "rewards/rollout_reward_func/mean": -0.5318750143051147, "rewards/rollout_reward_func/std": 2.727736711502075, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.437541961669922, "sampling/sampling_logp_difference/mean": 0.5171365141868591, "step": 240, "step_time": 10.28705909299606 }, { "clip_ratio/high_max": 0.0073351055034436285, "clip_ratio/high_mean": 0.0073351055034436285, "clip_ratio/low_mean": 0.005415727209765464, "clip_ratio/low_min": 0.005415727209765464, "clip_ratio/region_mean": 0.012750832829624414, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 485.375, "completions/mean_terminated_length": 485.375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.05897569051012397, "epoch": 9.640000385600015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.00454196659848094, "kl": 0.013876991404686123, "learning_rate": 7.462979999767843e-06, "loss": 0.0, "num_tokens": 3497685.0, "reward": 3.8196423053741455, "reward_std": 15.384727478027344, "rewards/rollout_reward_func/mean": 3.8196423053741455, "rewards/rollout_reward_func/std": 15.384727478027344, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.437503814697266, "sampling/sampling_logp_difference/mean": 0.4395808279514313, "step": 241, "step_time": 12.405389255000046 }, { "clip_ratio/high_max": 0.0043288139859214425, "clip_ratio/high_mean": 0.0043288139859214425, "clip_ratio/low_mean": 0.007479593099560589, "clip_ratio/low_min": 0.007479593099560589, "clip_ratio/region_mean": 0.011808407143689692, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 476.875, "completions/mean_terminated_length": 476.875, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "entropy": 0.06069478811696172, "epoch": 9.680000387200016e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014860992087051272, "kl": 0.011720258451532573, "learning_rate": 7.4629799997655724e-06, "loss": 0.0, "num_tokens": 3518208.0, "reward": 6.5878005027771, "reward_std": 12.039593696594238, "rewards/rollout_reward_func/mean": 6.5878005027771, "rewards/rollout_reward_func/std": 12.039594650268555, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.40625, "sampling/sampling_logp_difference/mean": 0.42934584617614746, "step": 242, "step_time": 12.591553901984298 }, { "clip_ratio/high_max": 0.00812077525188215, "clip_ratio/high_mean": 0.00812077525188215, "clip_ratio/low_mean": 0.006089435250032693, "clip_ratio/low_min": 0.006089435250032693, "clip_ratio/region_mean": 0.014210210123565048, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 461.875, "completions/mean_terminated_length": 461.875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.060478531289845705, "epoch": 9.720000388800015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.001686086063273251, "kl": 0.010323008405975997, "learning_rate": 7.462979999763291e-06, "loss": 0.0, "num_tokens": 3538455.0, "reward": -1.1936206817626953, "reward_std": 12.368781089782715, "rewards/rollout_reward_func/mean": -1.1936206817626953, "rewards/rollout_reward_func/std": 12.368781089782715, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.8671875, "sampling/sampling_logp_difference/mean": 0.45586228370666504, "step": 243, "step_time": 12.359489444002975 }, { "clip_ratio/high_max": 0.00432307634036988, "clip_ratio/high_mean": 0.00432307634036988, "clip_ratio/low_mean": 0.005388907738961279, "clip_ratio/low_min": 0.005388907738961279, "clip_ratio/region_mean": 0.009711984021123499, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 476.0, "completions/mean_terminated_length": 476.0, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.05943173170089722, "epoch": 9.760000390400016e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.001254537026397884, "kl": 0.013142870157025754, "learning_rate": 7.462979999760999e-06, "loss": 0.0, "num_tokens": 3558957.0, "reward": 5.946138381958008, "reward_std": 8.593766212463379, "rewards/rollout_reward_func/mean": 5.946138381958008, "rewards/rollout_reward_func/std": 8.593766212463379, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.34375, "sampling/sampling_logp_difference/mean": 0.4301120340824127, "step": 244, "step_time": 12.505282267011353 }, { "clip_ratio/high_max": 0.006040930398739874, "clip_ratio/high_mean": 0.006040930398739874, "clip_ratio/low_mean": 0.005412121943663806, "clip_ratio/low_min": 0.005412121943663806, "clip_ratio/region_mean": 0.01145305234240368, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 485.375, "completions/mean_terminated_length": 485.375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.060887419153004885, "epoch": 9.800000392000015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009134087595157325, "kl": 0.00952870852779597, "learning_rate": 7.4629799997586945e-06, "loss": 0.0, "num_tokens": 3579646.0, "reward": 1.7752292156219482, "reward_std": 8.136784553527832, "rewards/rollout_reward_func/mean": 1.7752292156219482, "rewards/rollout_reward_func/std": 8.136784553527832, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.109375, "sampling/sampling_logp_difference/mean": 0.43284955620765686, "step": 245, "step_time": 16.872464706000756 }, { "clip_ratio/high_max": 0.005525457265321165, "clip_ratio/high_mean": 0.005525457265321165, "clip_ratio/low_mean": 0.005077167006675154, "clip_ratio/low_min": 0.005077167006675154, "clip_ratio/region_mean": 0.01060262427199632, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 483.5625, "completions/mean_terminated_length": 483.5625, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.05976178077980876, "epoch": 9.840000393600016e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.001572477980516851, "kl": 0.012957213795743883, "learning_rate": 7.4629799997563804e-06, "loss": 0.0, "num_tokens": 3600295.0, "reward": 4.100962162017822, "reward_std": 11.323793411254883, "rewards/rollout_reward_func/mean": 4.100962162017822, "rewards/rollout_reward_func/std": 11.323793411254883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.43750762939453, "sampling/sampling_logp_difference/mean": 0.4335477948188782, "step": 246, "step_time": 12.73309612601588 }, { "clip_ratio/high_max": 0.004132567089982331, "clip_ratio/high_mean": 0.004132567089982331, "clip_ratio/low_mean": 0.006270818063057959, "clip_ratio/low_min": 0.006270818063057959, "clip_ratio/region_mean": 0.010403385094832629, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 487.1875, "completions/mean_terminated_length": 487.1875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.06171296630054712, "epoch": 9.880000395200015e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012757193762809038, "kl": 0.012607700715307146, "learning_rate": 7.4629799997540545e-06, "loss": 0.0, "num_tokens": 3621020.0, "reward": 7.327976226806641, "reward_std": 13.745925903320312, "rewards/rollout_reward_func/mean": 7.327976226806641, "rewards/rollout_reward_func/std": 13.745926856994629, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.28125, "sampling/sampling_logp_difference/mean": 0.41992413997650146, "step": 247, "step_time": 12.586696893995395 }, { "clip_ratio/high_max": 0.003879972209688276, "clip_ratio/high_mean": 0.003879972209688276, "clip_ratio/low_mean": 0.007710011035669595, "clip_ratio/low_min": 0.007710011035669595, "clip_ratio/region_mean": 0.01158998318715021, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 473.1875, "completions/mean_terminated_length": 473.1875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.058004386723041534, "epoch": 9.920000396800016e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.00358656607568264, "kl": 0.013681268610525876, "learning_rate": 7.4629799997517175e-06, "loss": 0.0, "num_tokens": 3641509.0, "reward": 7.924707412719727, "reward_std": 22.758777618408203, "rewards/rollout_reward_func/mean": 7.924707412719727, "rewards/rollout_reward_func/std": 22.75878143310547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.625, "sampling/sampling_logp_difference/mean": 0.4423797130584717, "step": 248, "step_time": 12.427905294993252 }, { "clip_ratio/high_max": 0.008787000231677666, "clip_ratio/high_mean": 0.008787000231677666, "clip_ratio/low_mean": 0.003998703643446788, "clip_ratio/low_min": 0.003998703643446788, "clip_ratio/region_mean": 0.012785703933332115, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 482.5625, "completions/mean_terminated_length": 482.5625, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.060131560545414686, "epoch": 9.960000398400017e-06, "frac_reward_zero_std": 0.0, "grad_norm": 0.003470468567684293, "kl": 0.014157407218590379, "learning_rate": 7.4629799997493695e-06, "loss": 0.0, "num_tokens": 3662149.0, "reward": -0.8538024425506592, "reward_std": 6.778453826904297, "rewards/rollout_reward_func/mean": -0.8538024425506592, "rewards/rollout_reward_func/std": 6.778453826904297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.625, "sampling/sampling_logp_difference/mean": 0.4286072254180908, "step": 249, "step_time": 12.452468871008023 }, { "clip_ratio/high_max": 0.007543086830992252, "clip_ratio/high_mean": 0.007543086830992252, "clip_ratio/low_mean": 0.004331797448685393, "clip_ratio/low_min": 0.004331797448685393, "clip_ratio/region_mean": 0.011874884250573814, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 490.4375, "completions/mean_terminated_length": 490.4375, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "entropy": 0.056148226372897625, "epoch": 1.0000000400000016e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015758451772853732, "kl": 0.011939728516153991, "learning_rate": 7.4629799997470106e-06, "loss": 0.0, "num_tokens": 3682933.0, "reward": 0.48137813806533813, "reward_std": 6.648929119110107, "rewards/rollout_reward_func/mean": 0.48137813806533813, "rewards/rollout_reward_func/std": 6.648929595947266, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.78125, "sampling/sampling_logp_difference/mean": 0.41050925850868225, "step": 250, "step_time": 12.659231566991366 }, { "clip_ratio/high_max": 0.008757062489166856, "clip_ratio/high_mean": 0.008757062489166856, "clip_ratio/low_mean": 0.005349813814973459, "clip_ratio/low_min": 0.005349813814973459, "clip_ratio/region_mean": 0.014106876333244145, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 470.3125, "completions/mean_terminated_length": 470.3125, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.05931273940950632, "epoch": 1.0040000401600017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015011584619060159, "kl": 0.011135238804854453, "learning_rate": 7.4629799997446406e-06, "loss": 0.0, "num_tokens": 3703356.0, "reward": 0.2585749626159668, "reward_std": 7.887979984283447, "rewards/rollout_reward_func/mean": 0.2585749626159668, "rewards/rollout_reward_func/std": 7.8879804611206055, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.0625, "sampling/sampling_logp_difference/mean": 0.45631733536720276, "step": 251, "step_time": 12.309525842007133 }, { "clip_ratio/high_max": 0.006828498502727598, "clip_ratio/high_mean": 0.006828498502727598, "clip_ratio/low_mean": 0.005991935759084299, "clip_ratio/low_min": 0.005991935759084299, "clip_ratio/region_mean": 0.012820434290915728, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 482.8125, "completions/mean_terminated_length": 482.8125, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.056756220292299986, "epoch": 1.0080000403200016e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014632738893851638, "kl": 0.010493356676306576, "learning_rate": 7.4629799997422596e-06, "loss": 0.0, "num_tokens": 3723988.0, "reward": -0.6187577247619629, "reward_std": 7.639528751373291, "rewards/rollout_reward_func/mean": -0.6187577247619629, "rewards/rollout_reward_func/std": 7.639528751373291, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.68756866455078, "sampling/sampling_logp_difference/mean": 0.43022042512893677, "step": 252, "step_time": 12.477801801003807 }, { "clip_ratio/high_max": 0.005545279069337994, "clip_ratio/high_mean": 0.005545279069337994, "clip_ratio/low_mean": 0.006151669775135815, "clip_ratio/low_min": 0.006151669775135815, "clip_ratio/region_mean": 0.011696948786266148, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 479.4375, "completions/mean_terminated_length": 479.4375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "entropy": 0.06023745099082589, "epoch": 1.0120000404800017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014754094881936908, "kl": 0.01279712188988924, "learning_rate": 7.4629799997398675e-06, "loss": 0.0, "num_tokens": 3744572.0, "reward": 6.42025899887085, "reward_std": 11.48573112487793, "rewards/rollout_reward_func/mean": 6.42025899887085, "rewards/rollout_reward_func/std": 11.485732078552246, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.8125, "sampling/sampling_logp_difference/mean": 0.43106433749198914, "step": 253, "step_time": 12.367979934009782 }, { "clip_ratio/high_max": 0.006086974870413542, "clip_ratio/high_mean": 0.006086974870413542, "clip_ratio/low_mean": 0.006955694290809333, "clip_ratio/low_min": 0.006955694290809333, "clip_ratio/region_mean": 0.013042669277638197, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 469.375, "completions/mean_terminated_length": 469.375, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.05657997727394104, "epoch": 1.0160000406400016e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015973574481904507, "kl": 0.011421989183872938, "learning_rate": 7.4629799997374645e-06, "loss": 0.0, "num_tokens": 3764970.0, "reward": 4.376631259918213, "reward_std": 9.203346252441406, "rewards/rollout_reward_func/mean": 4.376631259918213, "rewards/rollout_reward_func/std": 9.203346252441406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.21875, "sampling/sampling_logp_difference/mean": 0.4417969286441803, "step": 254, "step_time": 12.387225895974552 }, { "clip_ratio/high_max": 0.0037600568030029535, "clip_ratio/high_mean": 0.0037600568030029535, "clip_ratio/low_mean": 0.00768405239796266, "clip_ratio/low_min": 0.00768405239796266, "clip_ratio/region_mean": 0.011444109200965613, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 486.9375, "completions/mean_terminated_length": 486.9375, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.06312739662826061, "epoch": 1.0200000408000016e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010684729786589742, "kl": 0.008741641591768712, "learning_rate": 7.4629799997350505e-06, "loss": 0.0, "num_tokens": 3785685.0, "reward": 3.6158649921417236, "reward_std": 9.119112968444824, "rewards/rollout_reward_func/mean": 3.6158649921417236, "rewards/rollout_reward_func/std": 9.119112968444824, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.8125, "sampling/sampling_logp_difference/mean": 0.3905128836631775, "step": 255, "step_time": 12.43623554900114 }, { "clip_ratio/high_max": 0.004418649594299495, "clip_ratio/high_mean": 0.004418649594299495, "clip_ratio/low_mean": 0.009997335320804268, "clip_ratio/low_min": 0.009997335320804268, "clip_ratio/region_mean": 0.014415984973311424, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 469.0625, "completions/mean_terminated_length": 469.0625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.06023793155327439, "epoch": 1.0240000409600016e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011942661367356777, "kl": 0.01233168161706999, "learning_rate": 7.462979999732626e-06, "loss": 0.0, "num_tokens": 3806072.0, "reward": 5.521359920501709, "reward_std": 9.814002990722656, "rewards/rollout_reward_func/mean": 5.521359920501709, "rewards/rollout_reward_func/std": 9.814002990722656, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.85222625732422, "sampling/sampling_logp_difference/mean": 0.4507375955581665, "step": 256, "step_time": 12.298882849005167 }, { "clip_ratio/high_max": 0.004340314655564725, "clip_ratio/high_mean": 0.004340314655564725, "clip_ratio/low_mean": 0.007534513715654612, "clip_ratio/low_min": 0.007534513715654612, "clip_ratio/region_mean": 0.011874828371219337, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 489.75, "completions/mean_terminated_length": 489.75, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "entropy": 0.061934515833854675, "epoch": 1.0280000411200016e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011192240053787827, "kl": 0.010211084445472807, "learning_rate": 7.462979999730189e-06, "loss": 0.0, "num_tokens": 3826835.0, "reward": 7.558114051818848, "reward_std": 14.487411499023438, "rewards/rollout_reward_func/mean": 7.558114051818848, "rewards/rollout_reward_func/std": 14.487411499023438, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.671875, "sampling/sampling_logp_difference/mean": 0.44078385829925537, "step": 257, "step_time": 12.584944314985478 }, { "clip_ratio/high_max": 0.00611986372678075, "clip_ratio/high_mean": 0.00611986372678075, "clip_ratio/low_mean": 0.006488997198175639, "clip_ratio/low_min": 0.006488997198175639, "clip_ratio/region_mean": 0.012608860910404474, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 454.3125, "completions/mean_terminated_length": 454.3125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.055425433441996574, "epoch": 1.0320000412800017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036336067132651806, "kl": 0.009645639540394768, "learning_rate": 7.462979999727742e-06, "loss": 0.0, "num_tokens": 3846966.0, "reward": 10.433293342590332, "reward_std": 25.903295516967773, "rewards/rollout_reward_func/mean": 10.433293342590332, "rewards/rollout_reward_func/std": 25.90329360961914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.875, "sampling/sampling_logp_difference/mean": 0.4417724013328552, "step": 258, "step_time": 12.249064412004373 }, { "clip_ratio/high_max": 0.005511438241228461, "clip_ratio/high_mean": 0.005511438241228461, "clip_ratio/low_mean": 0.005774273071438074, "clip_ratio/low_min": 0.005774273071438074, "clip_ratio/region_mean": 0.011285711312666535, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 476.5, "completions/mean_terminated_length": 476.5, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.05703417770564556, "epoch": 1.0360000414400016e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008364490931853652, "kl": 0.008797154820058495, "learning_rate": 7.462979999725284e-06, "loss": 0.0, "num_tokens": 3867482.0, "reward": 0.331322580575943, "reward_std": 4.698633193969727, "rewards/rollout_reward_func/mean": 0.331322580575943, "rewards/rollout_reward_func/std": 4.698633193969727, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.718753814697266, "sampling/sampling_logp_difference/mean": 0.44211918115615845, "step": 259, "step_time": 12.49705230100517 }, { "clip_ratio/high_max": 0.004420571553055197, "clip_ratio/high_mean": 0.004420571553055197, "clip_ratio/low_mean": 0.006088959751650691, "clip_ratio/low_min": 0.006088959751650691, "clip_ratio/region_mean": 0.010509531304705888, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 461.375, "completions/mean_terminated_length": 461.375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.06019561970606446, "epoch": 1.0400000416000017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0018401237903162837, "kl": 0.013471962418407202, "learning_rate": 7.462979999722815e-06, "loss": 0.0, "num_tokens": 3887749.0, "reward": 9.291725158691406, "reward_std": 24.25479507446289, "rewards/rollout_reward_func/mean": 9.291725158691406, "rewards/rollout_reward_func/std": 24.25479507446289, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.125, "sampling/sampling_logp_difference/mean": 0.43625226616859436, "step": 260, "step_time": 12.315446009997686 }, { "clip_ratio/high_max": 0.007884113932959735, "clip_ratio/high_mean": 0.007884113932959735, "clip_ratio/low_mean": 0.00430821516783908, "clip_ratio/low_min": 0.00430821516783908, "clip_ratio/region_mean": 0.012192329159006476, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 477.6875, "completions/mean_terminated_length": 477.6875, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "entropy": 0.05937721114605665, "epoch": 1.0440000417600016e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0018535807030275464, "kl": 0.011027341126464307, "learning_rate": 7.462979999720334e-06, "loss": 0.0, "num_tokens": 3908292.0, "reward": 3.352263927459717, "reward_std": 9.870925903320312, "rewards/rollout_reward_func/mean": 3.352263927459717, "rewards/rollout_reward_func/std": 9.870926856994629, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 32.75, "sampling/sampling_logp_difference/mean": 0.4404783546924591, "step": 261, "step_time": 12.372810684006254 }, { "clip_ratio/high_max": 0.004372688650619239, "clip_ratio/high_mean": 0.004372688650619239, "clip_ratio/low_mean": 0.006906841590534896, "clip_ratio/low_min": 0.006906841590534896, "clip_ratio/region_mean": 0.011279530241154134, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 482.5, "completions/mean_terminated_length": 482.5, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.06207799818366766, "epoch": 1.0480000419200017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009643215453252196, "kl": 0.013935284689068794, "learning_rate": 7.462979999717843e-06, "loss": 0.0, "num_tokens": 3928917.0, "reward": 4.966146945953369, "reward_std": 9.476219177246094, "rewards/rollout_reward_func/mean": 4.966146945953369, "rewards/rollout_reward_func/std": 9.47622013092041, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 30.437522888183594, "sampling/sampling_logp_difference/mean": 0.42877206206321716, "step": 262, "step_time": 12.503842401994916 }, { "clip_ratio/high_max": 0.0057781032810453326, "clip_ratio/high_mean": 0.0057781032810453326, "clip_ratio/low_mean": 0.006406768632587045, "clip_ratio/low_min": 0.006406768632587045, "clip_ratio/region_mean": 0.012184871942736208, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 488.875, "completions/mean_terminated_length": 488.875, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "entropy": 0.05529961967840791, "epoch": 1.0520000420800016e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012745552230626345, "kl": 0.014336147694848478, "learning_rate": 7.462979999715341e-06, "loss": 0.0, "num_tokens": 3949676.0, "reward": 8.482166290283203, "reward_std": 11.522090911865234, "rewards/rollout_reward_func/mean": 8.482166290283203, "rewards/rollout_reward_func/std": 11.52209186553955, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.25, "sampling/sampling_logp_difference/mean": 0.4381629526615143, "step": 263, "step_time": 12.530874536998454 }, { "clip_ratio/high_max": 0.00432308716699481, "clip_ratio/high_mean": 0.00432308716699481, "clip_ratio/low_mean": 0.007698994129896164, "clip_ratio/low_min": 0.007698994129896164, "clip_ratio/region_mean": 0.012022081296890974, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 461.5625, "completions/mean_terminated_length": 461.5625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.06207476230338216, "epoch": 1.0560000422400017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.001411327044479549, "kl": 0.010092368524055928, "learning_rate": 7.462979999712827e-06, "loss": 0.0, "num_tokens": 3969924.0, "reward": 1.3122743368148804, "reward_std": 4.453250408172607, "rewards/rollout_reward_func/mean": 1.3122743368148804, "rewards/rollout_reward_func/std": 4.453250408172607, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.0, "sampling/sampling_logp_difference/mean": 0.4674755930900574, "step": 264, "step_time": 12.25840390999656 }, { "clip_ratio/high_max": 0.006424425635486841, "clip_ratio/high_mean": 0.006424425635486841, "clip_ratio/low_mean": 0.00592882459750399, "clip_ratio/low_min": 0.00592882459750399, "clip_ratio/region_mean": 0.01235325017478317, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 475.875, "completions/mean_terminated_length": 475.875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 0.059884792659431696, "epoch": 1.0600000424000018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00783798098564148, "kl": 0.010374104138463736, "learning_rate": 7.462979999710302e-06, "loss": 0.0, "num_tokens": 3990449.0, "reward": 2.2430131435394287, "reward_std": 9.809505462646484, "rewards/rollout_reward_func/mean": 2.2430131435394287, "rewards/rollout_reward_func/std": 9.8095064163208, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.593753814697266, "sampling/sampling_logp_difference/mean": 0.4413257837295532, "step": 265, "step_time": 12.329705101008585 }, { "clip_ratio/high_max": 0.004951957613229752, "clip_ratio/high_mean": 0.004951957613229752, "clip_ratio/low_mean": 0.007406344870105386, "clip_ratio/low_min": 0.007406344870105386, "clip_ratio/region_mean": 0.012358302366919816, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 478.875, "completions/mean_terminated_length": 478.875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.05654791509732604, "epoch": 1.0640000425600017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015851939097046852, "kl": 0.007371782674454153, "learning_rate": 7.462979999707767e-06, "loss": 0.0, "num_tokens": 4011011.0, "reward": 3.627781391143799, "reward_std": 7.626430034637451, "rewards/rollout_reward_func/mean": 3.627781391143799, "rewards/rollout_reward_func/std": 7.626430511474609, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.875, "sampling/sampling_logp_difference/mean": 0.4243747889995575, "step": 266, "step_time": 12.342798263001896 }, { "clip_ratio/high_max": 0.00584646308561787, "clip_ratio/high_mean": 0.00584646308561787, "clip_ratio/low_mean": 0.003695493855047971, "clip_ratio/low_min": 0.003695493855047971, "clip_ratio/region_mean": 0.009541956940665841, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 478.25, "completions/mean_terminated_length": 478.25, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.0623484798707068, "epoch": 1.0680000427200018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010341159068048, "kl": 0.006667473295237869, "learning_rate": 7.4629799997052205e-06, "loss": 0.0, "num_tokens": 4031570.0, "reward": 2.1012163162231445, "reward_std": 6.595694065093994, "rewards/rollout_reward_func/mean": 2.1012163162231445, "rewards/rollout_reward_func/std": 6.595694065093994, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.9375, "sampling/sampling_logp_difference/mean": 0.4408174455165863, "step": 267, "step_time": 12.40998328600108 }, { "clip_ratio/high_max": 0.007454435370163992, "clip_ratio/high_mean": 0.007454435370163992, "clip_ratio/low_mean": 0.005442031833808869, "clip_ratio/low_min": 0.005442031833808869, "clip_ratio/region_mean": 0.012896467233076692, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 470.625, "completions/mean_terminated_length": 470.625, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.05874144611880183, "epoch": 1.0720000428800017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011273957788944244, "kl": 0.011611878639087081, "learning_rate": 7.4629799997026625e-06, "loss": 0.0, "num_tokens": 4051996.0, "reward": 3.861449956893921, "reward_std": 10.771092414855957, "rewards/rollout_reward_func/mean": 3.861449956893921, "rewards/rollout_reward_func/std": 10.771092414855957, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.09375, "sampling/sampling_logp_difference/mean": 0.43682000041007996, "step": 268, "step_time": 12.350894674011215 }, { "clip_ratio/high_max": 0.008825042168609798, "clip_ratio/high_mean": 0.008825042168609798, "clip_ratio/low_mean": 0.0035038864152738824, "clip_ratio/low_min": 0.0035038864152738824, "clip_ratio/region_mean": 0.012328928452916443, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 485.4375, "completions/mean_terminated_length": 485.4375, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "entropy": 0.059671266470104456, "epoch": 1.0760000430400018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013947131810709834, "kl": 0.012244625337189063, "learning_rate": 7.462979999700093e-06, "loss": 0.0, "num_tokens": 4072688.0, "reward": -0.44163814187049866, "reward_std": 3.9991769790649414, "rewards/rollout_reward_func/mean": -0.44163814187049866, "rewards/rollout_reward_func/std": 3.9991772174835205, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.5, "sampling/sampling_logp_difference/mean": 0.43713900446891785, "step": 269, "step_time": 12.610149334999733 }, { "clip_ratio/high_max": 0.006754408241249621, "clip_ratio/high_mean": 0.006754408241249621, "clip_ratio/low_mean": 0.006456306320615113, "clip_ratio/low_min": 0.006456306320615113, "clip_ratio/region_mean": 0.013210714561864734, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 473.4375, "completions/mean_terminated_length": 473.4375, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.059659762773662806, "epoch": 1.0800000432000017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00106492405757308, "kl": 0.009237073245458305, "learning_rate": 7.462979999697513e-06, "loss": 0.0, "num_tokens": 4093155.0, "reward": 6.1496686935424805, "reward_std": 10.495515823364258, "rewards/rollout_reward_func/mean": 6.1496686935424805, "rewards/rollout_reward_func/std": 10.495516777038574, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.6640625, "sampling/sampling_logp_difference/mean": 0.4375869929790497, "step": 270, "step_time": 12.466007535011158 }, { "clip_ratio/high_max": 0.003438707208260894, "clip_ratio/high_mean": 0.003438707208260894, "clip_ratio/low_mean": 0.009398712951224297, "clip_ratio/low_min": 0.009398712951224297, "clip_ratio/region_mean": 0.012837419984862208, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 471.8125, "completions/mean_terminated_length": 471.8125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.06179855205118656, "epoch": 1.0840000433600018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.000914617907255888, "kl": 0.011420761060435325, "learning_rate": 7.462979999694922e-06, "loss": 0.0, "num_tokens": 4113618.0, "reward": 10.626422882080078, "reward_std": 22.82876205444336, "rewards/rollout_reward_func/mean": 10.626422882080078, "rewards/rollout_reward_func/std": 22.82876205444336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.431640625, "sampling/sampling_logp_difference/mean": 0.45161572098731995, "step": 271, "step_time": 12.520653734994994 }, { "clip_ratio/high_max": 0.005592918489128351, "clip_ratio/high_mean": 0.005592918489128351, "clip_ratio/low_mean": 0.005594978923909366, "clip_ratio/low_min": 0.005594978923909366, "clip_ratio/region_mean": 0.011187897413037717, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 480.6875, "completions/mean_terminated_length": 480.6875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.061326506081968546, "epoch": 1.0880000435200017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022123928647488356, "kl": 0.00918904657009989, "learning_rate": 7.462979999692321e-06, "loss": 0.0, "num_tokens": 4134225.0, "reward": 4.634449005126953, "reward_std": 9.206880569458008, "rewards/rollout_reward_func/mean": 4.634449005126953, "rewards/rollout_reward_func/std": 9.206880569458008, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.9453125, "sampling/sampling_logp_difference/mean": 0.42879122495651245, "step": 272, "step_time": 12.370798939984525 }, { "clip_ratio/high_max": 0.0037993506412021816, "clip_ratio/high_mean": 0.0037993506412021816, "clip_ratio/low_mean": 0.009132465231232345, "clip_ratio/low_min": 0.009132465231232345, "clip_ratio/region_mean": 0.012931815930642188, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 463.25, "completions/mean_terminated_length": 463.25, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.06293042376637459, "epoch": 1.0920000436800017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012524400372058153, "kl": 0.009548969595925882, "learning_rate": 7.462979999689708e-06, "loss": 0.0, "num_tokens": 4154535.0, "reward": 6.522550582885742, "reward_std": 21.918594360351562, "rewards/rollout_reward_func/mean": 6.522550582885742, "rewards/rollout_reward_func/std": 21.918594360351562, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.4375, "sampling/sampling_logp_difference/mean": 0.4446749985218048, "step": 273, "step_time": 12.306549353001174 }, { "clip_ratio/high_max": 0.007518723839893937, "clip_ratio/high_mean": 0.007518723839893937, "clip_ratio/low_mean": 0.00578542920993641, "clip_ratio/low_min": 0.00578542920993641, "clip_ratio/region_mean": 0.01330415322445333, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 471.5625, "completions/mean_terminated_length": 471.5625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "entropy": 0.06090191937983036, "epoch": 1.0960000438400018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010918100597336888, "kl": 0.011242087290156633, "learning_rate": 7.462979999687084e-06, "loss": 0.0, "num_tokens": 4174958.0, "reward": -0.7897244095802307, "reward_std": 7.027614593505859, "rewards/rollout_reward_func/mean": -0.7897244095802307, "rewards/rollout_reward_func/std": 7.027614116668701, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.5322265625, "sampling/sampling_logp_difference/mean": 0.44432368874549866, "step": 274, "step_time": 12.4101565829842 }, { "clip_ratio/high_max": 0.005192662705667317, "clip_ratio/high_mean": 0.005192662705667317, "clip_ratio/low_mean": 0.006590674282051623, "clip_ratio/low_min": 0.006590674282051623, "clip_ratio/region_mean": 0.011783336929511279, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 485.0, "completions/mean_terminated_length": 485.0, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.05838797939941287, "epoch": 1.1000000440000017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014052526094019413, "kl": 0.00874934991588816, "learning_rate": 7.462979999684449e-06, "loss": 0.0, "num_tokens": 4195629.0, "reward": 5.701841831207275, "reward_std": 11.788394927978516, "rewards/rollout_reward_func/mean": 5.701841831207275, "rewards/rollout_reward_func/std": 11.788395881652832, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.0625, "sampling/sampling_logp_difference/mean": 0.43115708231925964, "step": 275, "step_time": 12.433527405999484 }, { "clip_ratio/high_max": 0.006716331175994128, "clip_ratio/high_mean": 0.006716331175994128, "clip_ratio/low_mean": 0.003914693254046142, "clip_ratio/low_min": 0.003914693254046142, "clip_ratio/region_mean": 0.010631024313624948, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 476.9375, "completions/mean_terminated_length": 476.9375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.0559503841213882, "epoch": 1.1040000441600018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.000719607574865222, "kl": 0.012618855980690569, "learning_rate": 7.462979999681803e-06, "loss": 0.0, "num_tokens": 4216153.0, "reward": 5.177097797393799, "reward_std": 9.66783618927002, "rewards/rollout_reward_func/mean": 5.177097797393799, "rewards/rollout_reward_func/std": 9.66783618927002, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.75, "sampling/sampling_logp_difference/mean": 0.43632665276527405, "step": 276, "step_time": 12.402215536007134 }, { "clip_ratio/high_max": 0.006559659115737304, "clip_ratio/high_mean": 0.006559659115737304, "clip_ratio/low_mean": 0.005120955203892663, "clip_ratio/low_min": 0.005120955203892663, "clip_ratio/region_mean": 0.011680614203214645, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 474.0625, "completions/mean_terminated_length": 474.0625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.05912518873810768, "epoch": 1.1080000443200017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014399357605725527, "kl": 0.016669759526848793, "learning_rate": 7.462979999679146e-06, "loss": 0.0, "num_tokens": 4236645.0, "reward": 10.580717086791992, "reward_std": 25.418909072875977, "rewards/rollout_reward_func/mean": 10.580717086791992, "rewards/rollout_reward_func/std": 25.418909072875977, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.3125, "sampling/sampling_logp_difference/mean": 0.4360669255256653, "step": 277, "step_time": 12.418308398009685 }, { "clip_ratio/high_max": 0.005945313605479896, "clip_ratio/high_mean": 0.005945313605479896, "clip_ratio/low_mean": 0.0063522676937282085, "clip_ratio/low_min": 0.0063522676937282085, "clip_ratio/region_mean": 0.012297581299208105, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 472.6875, "completions/mean_terminated_length": 472.6875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.05885827774181962, "epoch": 1.1120000444800018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0023197599221020937, "kl": 0.01188258407637477, "learning_rate": 7.462979999676477e-06, "loss": 0.0, "num_tokens": 4257117.0, "reward": 7.717413902282715, "reward_std": 26.061771392822266, "rewards/rollout_reward_func/mean": 7.717413902282715, "rewards/rollout_reward_func/std": 26.061771392822266, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.812503814697266, "sampling/sampling_logp_difference/mean": 0.4361405372619629, "step": 278, "step_time": 12.322495529988373 }, { "clip_ratio/high_max": 0.005348091683117673, "clip_ratio/high_mean": 0.005348091683117673, "clip_ratio/low_mean": 0.00727691181236878, "clip_ratio/low_min": 0.00727691181236878, "clip_ratio/region_mean": 0.012625003349967301, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 477.3125, "completions/mean_terminated_length": 477.3125, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "entropy": 0.05997967114672065, "epoch": 1.1160000446400017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002667461521923542, "kl": 0.009852950664935634, "learning_rate": 7.462979999673797e-06, "loss": 0.0, "num_tokens": 4277648.0, "reward": 4.140770435333252, "reward_std": 11.348020553588867, "rewards/rollout_reward_func/mean": 4.140770435333252, "rewards/rollout_reward_func/std": 11.348020553588867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.0625, "sampling/sampling_logp_difference/mean": 0.4272199869155884, "step": 279, "step_time": 12.500515396000992 }, { "clip_ratio/high_max": 0.005127100041136146, "clip_ratio/high_mean": 0.005127100041136146, "clip_ratio/low_mean": 0.007319469295907766, "clip_ratio/low_min": 0.007319469295907766, "clip_ratio/region_mean": 0.01244656927883625, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 464.5, "completions/mean_terminated_length": 464.5, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.0658340360969305, "epoch": 1.1200000448000018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010641287080943584, "kl": 0.008458018448436633, "learning_rate": 7.462979999671107e-06, "loss": 0.0, "num_tokens": 4297990.0, "reward": 5.7715325355529785, "reward_std": 25.187725067138672, "rewards/rollout_reward_func/mean": 5.7715325355529785, "rewards/rollout_reward_func/std": 25.187725067138672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.15625, "sampling/sampling_logp_difference/mean": 0.4288370609283447, "step": 280, "step_time": 12.27823407100368 }, { "clip_ratio/high_max": 0.004979482997441664, "clip_ratio/high_mean": 0.004979482997441664, "clip_ratio/low_mean": 0.005721139023080468, "clip_ratio/low_min": 0.005721139023080468, "clip_ratio/region_mean": 0.010700622049625963, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 480.0625, "completions/mean_terminated_length": 480.0625, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.06035555340349674, "epoch": 1.1240000449600017e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.001452988595701754, "kl": 0.013508589123375714, "learning_rate": 7.462979999668406e-06, "loss": 0.0, "num_tokens": 4318579.0, "reward": 3.7368030548095703, "reward_std": 10.109482765197754, "rewards/rollout_reward_func/mean": 3.7368030548095703, "rewards/rollout_reward_func/std": 10.10948371887207, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.59375, "sampling/sampling_logp_difference/mean": 0.41544458270072937, "step": 281, "step_time": 12.63631542899384 }, { "clip_ratio/high_max": 0.005520201462786645, "clip_ratio/high_mean": 0.005520201462786645, "clip_ratio/low_mean": 0.0058666408294811845, "clip_ratio/low_min": 0.0058666408294811845, "clip_ratio/region_mean": 0.011386842234060168, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 484.8125, "completions/mean_terminated_length": 484.8125, "completions/min_length": 460.0, "completions/min_terminated_length": 460.0, "entropy": 0.061759854201227427, "epoch": 1.1280000451200018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002007812261581421, "kl": 0.00998575979610905, "learning_rate": 7.462979999665694e-06, "loss": 0.0, "num_tokens": 4339258.0, "reward": 1.5153712034225464, "reward_std": 4.384790897369385, "rewards/rollout_reward_func/mean": 1.5153712034225464, "rewards/rollout_reward_func/std": 4.384790897369385, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.6171875, "sampling/sampling_logp_difference/mean": 0.43113160133361816, "step": 282, "step_time": 12.494548136004596 }, { "clip_ratio/high_max": 0.005130304314661771, "clip_ratio/high_mean": 0.005130304314661771, "clip_ratio/low_mean": 0.006115434312960133, "clip_ratio/low_min": 0.006115434312960133, "clip_ratio/region_mean": 0.011245738598518074, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 479.4375, "completions/mean_terminated_length": 479.4375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.06413915660232306, "epoch": 1.1320000452800019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014538420364260674, "kl": 0.012599464564118534, "learning_rate": 7.4629799996629705e-06, "loss": 0.0, "num_tokens": 4359838.0, "reward": 6.633513927459717, "reward_std": 10.88791561126709, "rewards/rollout_reward_func/mean": 6.633513927459717, "rewards/rollout_reward_func/std": 10.88791561126709, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.125, "sampling/sampling_logp_difference/mean": 0.43903622031211853, "step": 283, "step_time": 12.440380214007746 }, { "clip_ratio/high_max": 0.006402397761121392, "clip_ratio/high_mean": 0.006402397761121392, "clip_ratio/low_mean": 0.005169383599422872, "clip_ratio/low_min": 0.005169383599422872, "clip_ratio/region_mean": 0.011571781302336603, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 484.875, "completions/mean_terminated_length": 484.875, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.06048109149560332, "epoch": 1.1360000454400018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024668793193995953, "kl": 0.012110393785405904, "learning_rate": 7.4629799996602354e-06, "loss": 0.0, "num_tokens": 4380519.0, "reward": 5.067960739135742, "reward_std": 10.687834739685059, "rewards/rollout_reward_func/mean": 5.067960739135742, "rewards/rollout_reward_func/std": 10.687834739685059, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.5, "sampling/sampling_logp_difference/mean": 0.4175453782081604, "step": 284, "step_time": 12.633495148002112 }, { "clip_ratio/high_max": 0.005658332433085889, "clip_ratio/high_mean": 0.005658332433085889, "clip_ratio/low_mean": 0.006272003083722666, "clip_ratio/low_min": 0.006272003083722666, "clip_ratio/region_mean": 0.011930335487704724, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 464.4375, "completions/mean_terminated_length": 464.4375, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.061753932386636734, "epoch": 1.1400000456000019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.001247489359229803, "kl": 0.008844703726936132, "learning_rate": 7.462979999657491e-06, "loss": 0.0, "num_tokens": 4400828.0, "reward": 0.8020138740539551, "reward_std": 8.948838233947754, "rewards/rollout_reward_func/mean": 0.8020138740539551, "rewards/rollout_reward_func/std": 8.94883918762207, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.125, "sampling/sampling_logp_difference/mean": 0.4513135552406311, "step": 285, "step_time": 12.365952976011613 }, { "clip_ratio/high_max": 0.0040894217672757804, "clip_ratio/high_mean": 0.0040894217672757804, "clip_ratio/low_mean": 0.005832715629367158, "clip_ratio/low_min": 0.005832715629367158, "clip_ratio/region_mean": 0.009922137425746769, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 472.875, "completions/mean_terminated_length": 472.875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.06184629164636135, "epoch": 1.1440000457600018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011110268533229828, "kl": 0.013684421428479254, "learning_rate": 7.462979999654733e-06, "loss": 0.0, "num_tokens": 4421279.0, "reward": 7.799790382385254, "reward_std": 13.634159088134766, "rewards/rollout_reward_func/mean": 7.799790382385254, "rewards/rollout_reward_func/std": 13.634159088134766, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.453125, "sampling/sampling_logp_difference/mean": 0.4484032392501831, "step": 286, "step_time": 12.431199105005362 }, { "clip_ratio/high_max": 0.003994017228251323, "clip_ratio/high_mean": 0.003994017228251323, "clip_ratio/low_mean": 0.0059050037525594234, "clip_ratio/low_min": 0.0059050037525594234, "clip_ratio/region_mean": 0.009899020951706916, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 482.875, "completions/mean_terminated_length": 482.875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.0635405438952148, "epoch": 1.1480000459200019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.001257132040336728, "kl": 0.00905425357632339, "learning_rate": 7.462979999651966e-06, "loss": 0.0, "num_tokens": 4441906.0, "reward": 7.659509658813477, "reward_std": 16.166481018066406, "rewards/rollout_reward_func/mean": 7.659509658813477, "rewards/rollout_reward_func/std": 16.16648292541504, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.20988845825195, "sampling/sampling_logp_difference/mean": 0.42244622111320496, "step": 287, "step_time": 12.70594268500281 }, { "clip_ratio/high_max": 0.005951115279458463, "clip_ratio/high_mean": 0.005951115279458463, "clip_ratio/low_mean": 0.006586950330529362, "clip_ratio/low_min": 0.006586950330529362, "clip_ratio/region_mean": 0.012538065551780164, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 468.8125, "completions/mean_terminated_length": 468.8125, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.06043345807120204, "epoch": 1.1520000460800018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013255416415631771, "kl": 0.008739405020605773, "learning_rate": 7.462979999649187e-06, "loss": 0.0, "num_tokens": 4462285.0, "reward": 2.827208995819092, "reward_std": 8.63088607788086, "rewards/rollout_reward_func/mean": 2.827208995819092, "rewards/rollout_reward_func/std": 8.63088607788086, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.96875, "sampling/sampling_logp_difference/mean": 0.4330766201019287, "step": 288, "step_time": 12.481659836987092 }, { "clip_ratio/high_max": 0.006276577652897686, "clip_ratio/high_mean": 0.006276577652897686, "clip_ratio/low_mean": 0.005595754133537412, "clip_ratio/low_min": 0.005595754133537412, "clip_ratio/region_mean": 0.011872331728227437, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 479.3125, "completions/mean_terminated_length": 479.3125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.0594133366830647, "epoch": 1.1560000462400019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013208533637225628, "kl": 0.007868419721489772, "learning_rate": 7.4629799996463974e-06, "loss": 0.0, "num_tokens": 4482860.0, "reward": 1.073749303817749, "reward_std": 6.844740390777588, "rewards/rollout_reward_func/mean": 1.073749303817749, "rewards/rollout_reward_func/std": 6.844740867614746, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.375, "sampling/sampling_logp_difference/mean": 0.42820778489112854, "step": 289, "step_time": 12.727329614986957 }, { "clip_ratio/high_max": 0.00436265574535355, "clip_ratio/high_mean": 0.00436265574535355, "clip_ratio/low_mean": 0.006883917783852667, "clip_ratio/low_min": 0.006883917783852667, "clip_ratio/region_mean": 0.011246573529206216, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 475.875, "completions/mean_terminated_length": 475.875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.06176292849704623, "epoch": 1.160000046400002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0020133706275373697, "kl": 0.008996854012366384, "learning_rate": 7.462979999643597e-06, "loss": 0.0, "num_tokens": 4503399.0, "reward": 6.517318248748779, "reward_std": 25.067806243896484, "rewards/rollout_reward_func/mean": 6.517318248748779, "rewards/rollout_reward_func/std": 25.067808151245117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.8984375, "sampling/sampling_logp_difference/mean": 0.4431551396846771, "step": 290, "step_time": 12.491659913990588 }, { "clip_ratio/high_max": 0.006164493534015492, "clip_ratio/high_mean": 0.006164493534015492, "clip_ratio/low_mean": 0.0057983906008303165, "clip_ratio/low_min": 0.0057983906008303165, "clip_ratio/region_mean": 0.0119628842221573, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 476.875, "completions/mean_terminated_length": 476.875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.05897283321246505, "epoch": 1.1640000465600018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.001489353715442121, "kl": 0.010368095012381673, "learning_rate": 7.462979999640785e-06, "loss": 0.0, "num_tokens": 4523953.0, "reward": 6.248405933380127, "reward_std": 24.831350326538086, "rewards/rollout_reward_func/mean": 6.248405933380127, "rewards/rollout_reward_func/std": 24.831350326538086, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.96875, "sampling/sampling_logp_difference/mean": 0.4415266215801239, "step": 291, "step_time": 12.653455136991397 }, { "clip_ratio/high_max": 0.0034421161690261215, "clip_ratio/high_mean": 0.0034421161690261215, "clip_ratio/low_mean": 0.008090815506875515, "clip_ratio/low_min": 0.008090815506875515, "clip_ratio/region_mean": 0.011532931705005467, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 483.25, "completions/mean_terminated_length": 483.25, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.05824081739410758, "epoch": 1.168000046720002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0019399435259401798, "kl": 0.008363369444850832, "learning_rate": 7.462979999637962e-06, "loss": 0.0, "num_tokens": 4544634.0, "reward": 11.665218353271484, "reward_std": 22.42340087890625, "rewards/rollout_reward_func/mean": 11.665218353271484, "rewards/rollout_reward_func/std": 22.423402786254883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.062503814697266, "sampling/sampling_logp_difference/mean": 0.4290911853313446, "step": 292, "step_time": 12.452575489012816 }, { "clip_ratio/high_max": 0.003972196718677878, "clip_ratio/high_mean": 0.003972196718677878, "clip_ratio/low_mean": 0.007231440889881924, "clip_ratio/low_min": 0.007231440889881924, "clip_ratio/region_mean": 0.011203637695871294, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 487.4375, "completions/mean_terminated_length": 487.4375, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 0.06051658000797033, "epoch": 1.1720000468800018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009627161780372262, "kl": 0.011055759328883141, "learning_rate": 7.462979999635128e-06, "loss": 0.0, "num_tokens": 4565374.0, "reward": 4.622981071472168, "reward_std": 7.745675563812256, "rewards/rollout_reward_func/mean": 4.622981071472168, "rewards/rollout_reward_func/std": 7.745676517486572, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.75, "sampling/sampling_logp_difference/mean": 0.4193398058414459, "step": 293, "step_time": 12.521304191999661 }, { "clip_ratio/high_max": 0.005478326667798683, "clip_ratio/high_mean": 0.005478326667798683, "clip_ratio/low_mean": 0.005602858276688494, "clip_ratio/low_min": 0.005602858276688494, "clip_ratio/region_mean": 0.011081184900831431, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 490.625, "completions/mean_terminated_length": 490.625, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 0.057473327964544296, "epoch": 1.176000047040002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032781658228486776, "kl": 0.013267906499095261, "learning_rate": 7.4629799996322825e-06, "loss": 0.0, "num_tokens": 4586166.0, "reward": 0.8493175506591797, "reward_std": 8.474839210510254, "rewards/rollout_reward_func/mean": 0.8493175506591797, "rewards/rollout_reward_func/std": 8.474839210510254, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.0, "sampling/sampling_logp_difference/mean": 0.4253947138786316, "step": 294, "step_time": 12.70020199901046 }, { "clip_ratio/high_max": 0.006256476801354438, "clip_ratio/high_mean": 0.006256476801354438, "clip_ratio/low_mean": 0.006297188840107992, "clip_ratio/low_min": 0.006297188840107992, "clip_ratio/region_mean": 0.012553665495943278, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 476.5625, "completions/mean_terminated_length": 476.5625, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.05877308966591954, "epoch": 1.1800000472000018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014043665723875165, "kl": 0.011000591504853219, "learning_rate": 7.462979999629428e-06, "loss": 0.0, "num_tokens": 4606678.0, "reward": 1.7957574129104614, "reward_std": 9.623724937438965, "rewards/rollout_reward_func/mean": 1.7957574129104614, "rewards/rollout_reward_func/std": 9.623725891113281, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.687538146972656, "sampling/sampling_logp_difference/mean": 0.44337165355682373, "step": 295, "step_time": 12.56301343398809 }, { "clip_ratio/high_max": 0.006753956986358389, "clip_ratio/high_mean": 0.006753956986358389, "clip_ratio/low_mean": 0.0051958507392555475, "clip_ratio/low_min": 0.0051958507392555475, "clip_ratio/region_mean": 0.011949807871133089, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 492.0625, "completions/mean_terminated_length": 492.0625, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.060645511373877525, "epoch": 1.1840000473600019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011390440631657839, "kl": 0.012034592044074088, "learning_rate": 7.46297999962656e-06, "loss": 0.0, "num_tokens": 4627474.0, "reward": -0.8119000196456909, "reward_std": 6.145676136016846, "rewards/rollout_reward_func/mean": -0.8119000196456909, "rewards/rollout_reward_func/std": 6.145676136016846, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.6875, "sampling/sampling_logp_difference/mean": 0.43205440044403076, "step": 296, "step_time": 12.623401477991138 }, { "clip_ratio/high_max": 0.004132662375923246, "clip_ratio/high_mean": 0.004132662375923246, "clip_ratio/low_mean": 0.005044615856604651, "clip_ratio/low_min": 0.005044615856604651, "clip_ratio/region_mean": 0.009177278378047049, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 489.8125, "completions/mean_terminated_length": 489.8125, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.057854637037962675, "epoch": 1.1880000475200018e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.001472477917559445, "kl": 0.01055531029123813, "learning_rate": 7.462979999623682e-06, "loss": 0.0, "num_tokens": 4648238.0, "reward": 3.0155506134033203, "reward_std": 9.662580490112305, "rewards/rollout_reward_func/mean": 3.0155506134033203, "rewards/rollout_reward_func/std": 9.662580490112305, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.6875, "sampling/sampling_logp_difference/mean": 0.41327741742134094, "step": 297, "step_time": 12.594681071997911 }, { "clip_ratio/high_max": 0.005709123855922371, "clip_ratio/high_mean": 0.005709123855922371, "clip_ratio/low_mean": 0.0066131987841799855, "clip_ratio/low_min": 0.0066131987841799855, "clip_ratio/region_mean": 0.012322322581894696, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 461.8125, "completions/mean_terminated_length": 461.8125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.06258537899702787, "epoch": 1.1920000476800019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012755371863022447, "kl": 0.009249743598047644, "learning_rate": 7.4629799996207925e-06, "loss": 0.0, "num_tokens": 4668503.0, "reward": 1.6417794227600098, "reward_std": 6.1886796951293945, "rewards/rollout_reward_func/mean": 1.6417794227600098, "rewards/rollout_reward_func/std": 6.1886796951293945, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.375, "sampling/sampling_logp_difference/mean": 0.4400327205657959, "step": 298, "step_time": 12.210946166014764 }, { "clip_ratio/high_max": 0.007359763199929148, "clip_ratio/high_mean": 0.007359763199929148, "clip_ratio/low_mean": 0.004303418623749167, "clip_ratio/low_min": 0.004303418623749167, "clip_ratio/region_mean": 0.011663181707262993, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 476.4375, "completions/mean_terminated_length": 476.4375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.06203242624178529, "epoch": 1.196000047840002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013819544110447168, "kl": 0.01076879835454747, "learning_rate": 7.462979999617892e-06, "loss": 0.0, "num_tokens": 4689019.0, "reward": 0.252280592918396, "reward_std": 4.116568088531494, "rewards/rollout_reward_func/mean": 0.252280592918396, "rewards/rollout_reward_func/std": 4.116568088531494, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.9609375, "sampling/sampling_logp_difference/mean": 0.45262661576271057, "step": 299, "step_time": 12.463808132008126 }, { "clip_ratio/high_max": 0.007174602593295276, "clip_ratio/high_mean": 0.007174602593295276, "clip_ratio/low_mean": 0.004320813692174852, "clip_ratio/low_min": 0.004320813692174852, "clip_ratio/region_mean": 0.011495416285470128, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 473.0, "completions/mean_terminated_length": 473.0, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.055134132504463196, "epoch": 1.2000000480000019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011236731661483645, "kl": 0.009065479098353535, "learning_rate": 7.462979999614981e-06, "loss": 0.0, "num_tokens": 4709475.0, "reward": 1.8898600339889526, "reward_std": 4.56050968170166, "rewards/rollout_reward_func/mean": 1.8898600339889526, "rewards/rollout_reward_func/std": 4.56050968170166, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.48438262939453, "sampling/sampling_logp_difference/mean": 0.4465251863002777, "step": 300, "step_time": 12.550025395015837 }, { "clip_ratio/high_max": 0.0062248659087345, "clip_ratio/high_mean": 0.0062248659087345, "clip_ratio/low_mean": 0.007447682088240981, "clip_ratio/low_min": 0.007447682088240981, "clip_ratio/region_mean": 0.01367254788056016, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 467.0, "completions/mean_terminated_length": 467.0, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.06147481268271804, "epoch": 1.204000048160002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.000951342866756022, "kl": 0.007852587761590257, "learning_rate": 7.462979999612059e-06, "loss": 0.0, "num_tokens": 4729816.0, "reward": 5.597659111022949, "reward_std": 9.760183334350586, "rewards/rollout_reward_func/mean": 5.597659111022949, "rewards/rollout_reward_func/std": 9.760183334350586, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.875, "sampling/sampling_logp_difference/mean": 0.45297011733055115, "step": 301, "step_time": 12.456408048987214 }, { "clip_ratio/high_max": 0.0052884569158777595, "clip_ratio/high_mean": 0.0052884569158777595, "clip_ratio/low_mean": 0.005977407476166263, "clip_ratio/low_min": 0.005977407476166263, "clip_ratio/region_mean": 0.011265864362940192, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 470.25, "completions/mean_terminated_length": 470.25, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "entropy": 0.06219326797872782, "epoch": 1.2080000483200019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014575911918655038, "kl": 0.011847823509015143, "learning_rate": 7.4629799996091255e-06, "loss": 0.0, "num_tokens": 4750225.0, "reward": -2.0577099323272705, "reward_std": 10.905440330505371, "rewards/rollout_reward_func/mean": -2.0577099323272705, "rewards/rollout_reward_func/std": 10.905441284179688, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.4375, "sampling/sampling_logp_difference/mean": 0.45705294609069824, "step": 302, "step_time": 12.660937999018643 }, { "clip_ratio/high_max": 0.004063633881742135, "clip_ratio/high_mean": 0.004063633881742135, "clip_ratio/low_mean": 0.005774876743089408, "clip_ratio/low_min": 0.005774876743089408, "clip_ratio/region_mean": 0.009838510653935373, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 476.5625, "completions/mean_terminated_length": 476.5625, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "entropy": 0.06072111940011382, "epoch": 1.212000048480002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0008723388309590518, "kl": 0.009600896504707634, "learning_rate": 7.462979999606182e-06, "loss": 0.0, "num_tokens": 4770740.0, "reward": 2.13948917388916, "reward_std": 7.89893102645874, "rewards/rollout_reward_func/mean": 2.13948917388916, "rewards/rollout_reward_func/std": 7.89893102645874, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.171878814697266, "sampling/sampling_logp_difference/mean": 0.45034223794937134, "step": 303, "step_time": 12.379606041009538 }, { "clip_ratio/high_max": 0.005527417873963714, "clip_ratio/high_mean": 0.005527417873963714, "clip_ratio/low_mean": 0.004871475248364732, "clip_ratio/low_min": 0.004871475248364732, "clip_ratio/region_mean": 0.010398893267847598, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 486.3125, "completions/mean_terminated_length": 486.3125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.060428479220718145, "epoch": 1.2160000486400019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0006201016949489713, "kl": 0.012834622408263385, "learning_rate": 7.462979999603225e-06, "loss": 0.0, "num_tokens": 4791434.0, "reward": 2.846992015838623, "reward_std": 10.08895206451416, "rewards/rollout_reward_func/mean": 2.846992015838623, "rewards/rollout_reward_func/std": 10.08895206451416, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.6875, "sampling/sampling_logp_difference/mean": 0.4526660740375519, "step": 304, "step_time": 12.516566964994126 }, { "clip_ratio/high_max": 0.004700978082837537, "clip_ratio/high_mean": 0.004700978082837537, "clip_ratio/low_mean": 0.005794972734292969, "clip_ratio/low_min": 0.005794972734292969, "clip_ratio/region_mean": 0.010495950758922845, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 461.1875, "completions/mean_terminated_length": 461.1875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.05919934203848243, "epoch": 1.220000048800002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010927891125902534, "kl": 0.007305314997211099, "learning_rate": 7.462979999600259e-06, "loss": 0.0, "num_tokens": 4811701.0, "reward": 6.863140106201172, "reward_std": 25.174884796142578, "rewards/rollout_reward_func/mean": 6.863140106201172, "rewards/rollout_reward_func/std": 25.17488670349121, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.1875, "sampling/sampling_logp_difference/mean": 0.4547431766986847, "step": 305, "step_time": 12.452665372999036 }, { "clip_ratio/high_max": 0.005410962330643088, "clip_ratio/high_mean": 0.005410962330643088, "clip_ratio/low_mean": 0.0057005729759112, "clip_ratio/low_min": 0.0057005729759112, "clip_ratio/region_mean": 0.01111153542296961, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 478.0625, "completions/mean_terminated_length": 478.0625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.05708180787041783, "epoch": 1.2240000489600019e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010784674668684602, "kl": 0.009756926854606718, "learning_rate": 7.462979999597281e-06, "loss": 0.0, "num_tokens": 4832280.0, "reward": 5.923610210418701, "reward_std": 25.11573028564453, "rewards/rollout_reward_func/mean": 5.923610210418701, "rewards/rollout_reward_func/std": 25.115732192993164, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.00000762939453, "sampling/sampling_logp_difference/mean": 0.43616026639938354, "step": 306, "step_time": 12.372899060981581 }, { "clip_ratio/high_max": 0.0065210177563130856, "clip_ratio/high_mean": 0.0065210177563130856, "clip_ratio/low_mean": 0.005876491311937571, "clip_ratio/low_min": 0.005876491311937571, "clip_ratio/region_mean": 0.012397508951835334, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 479.625, "completions/mean_terminated_length": 479.625, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "entropy": 0.06256403448060155, "epoch": 1.228000049120002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002325798384845257, "kl": 0.008326490933541209, "learning_rate": 7.462979999594292e-06, "loss": 0.0, "num_tokens": 4852855.0, "reward": 1.1561849117279053, "reward_std": 6.758068561553955, "rewards/rollout_reward_func/mean": 1.1561849117279053, "rewards/rollout_reward_func/std": 6.758068561553955, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 31.12500762939453, "sampling/sampling_logp_difference/mean": 0.4188748896121979, "step": 307, "step_time": 12.44288911199692 }, { "clip_ratio/high_max": 0.0066905420972034335, "clip_ratio/high_mean": 0.0066905420972034335, "clip_ratio/low_mean": 0.006002770940540358, "clip_ratio/low_min": 0.006002770940540358, "clip_ratio/region_mean": 0.012693313066847622, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 487.6875, "completions/mean_terminated_length": 487.6875, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "entropy": 0.05627503711730242, "epoch": 1.232000049280002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.001223902334459126, "kl": 0.011558682599570602, "learning_rate": 7.462979999591293e-06, "loss": 0.0, "num_tokens": 4873577.0, "reward": 2.756261110305786, "reward_std": 11.454345703125, "rewards/rollout_reward_func/mean": 2.756261110305786, "rewards/rollout_reward_func/std": 11.454345703125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.875, "sampling/sampling_logp_difference/mean": 0.4371177554130554, "step": 308, "step_time": 12.59396385999571 }, { "clip_ratio/high_max": 0.005678598070517182, "clip_ratio/high_mean": 0.005678598070517182, "clip_ratio/low_mean": 0.0061370857583824545, "clip_ratio/low_min": 0.0061370857583824545, "clip_ratio/region_mean": 0.011815683799795806, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 487.0, "completions/mean_terminated_length": 487.0, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.056851573288440704, "epoch": 1.236000049440002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011849136790260673, "kl": 0.009509575669653714, "learning_rate": 7.4629799995882825e-06, "loss": 0.0, "num_tokens": 4894309.0, "reward": 3.668288230895996, "reward_std": 7.2263503074646, "rewards/rollout_reward_func/mean": 3.668288230895996, "rewards/rollout_reward_func/std": 7.226350784301758, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.3125, "sampling/sampling_logp_difference/mean": 0.4263158440589905, "step": 309, "step_time": 12.44038709600136 }, { "clip_ratio/high_max": 0.007815445307642221, "clip_ratio/high_mean": 0.007815445307642221, "clip_ratio/low_mean": 0.0057155193062499166, "clip_ratio/low_min": 0.0057155193062499166, "clip_ratio/region_mean": 0.01353096473030746, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 457.3125, "completions/mean_terminated_length": 457.3125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.0625912044197321, "epoch": 1.240000049600002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0012316321954131126, "kl": 0.007488197472412139, "learning_rate": 7.46297999958526e-06, "loss": 0.0, "num_tokens": 4914474.0, "reward": 6.51238489151001, "reward_std": 9.637356758117676, "rewards/rollout_reward_func/mean": 6.51238489151001, "rewards/rollout_reward_func/std": 9.637357711791992, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.75, "sampling/sampling_logp_difference/mean": 0.4291302263736725, "step": 310, "step_time": 12.175220567994984 }, { "clip_ratio/high_max": 0.006625733163673431, "clip_ratio/high_mean": 0.006625733163673431, "clip_ratio/low_mean": 0.0061888374330010265, "clip_ratio/low_min": 0.0061888374330010265, "clip_ratio/region_mean": 0.012814570509362966, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 463.625, "completions/mean_terminated_length": 463.625, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.062343116383999586, "epoch": 1.244000049760002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011582565493881702, "kl": 0.008820511109661311, "learning_rate": 7.462979999582228e-06, "loss": 0.0, "num_tokens": 4934765.0, "reward": 5.874722480773926, "reward_std": 9.614212989807129, "rewards/rollout_reward_func/mean": 5.874722480773926, "rewards/rollout_reward_func/std": 9.614212989807129, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.8125, "sampling/sampling_logp_difference/mean": 0.4476583003997803, "step": 311, "step_time": 12.40863186999195 }, { "clip_ratio/high_max": 0.005343334632925689, "clip_ratio/high_mean": 0.005343334632925689, "clip_ratio/low_mean": 0.006588078278582543, "clip_ratio/low_min": 0.006588078278582543, "clip_ratio/region_mean": 0.011931412853300571, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 462.625, "completions/mean_terminated_length": 462.625, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.05799495615065098, "epoch": 1.248000049920002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002227329881861806, "kl": 0.011002468643710017, "learning_rate": 7.462979999579184e-06, "loss": 0.0, "num_tokens": 4955056.0, "reward": 7.257813453674316, "reward_std": 19.653560638427734, "rewards/rollout_reward_func/mean": 7.257813453674316, "rewards/rollout_reward_func/std": 19.653562545776367, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.59394454956055, "sampling/sampling_logp_difference/mean": 0.43897745013237, "step": 312, "step_time": 12.400978846002545 }, { "clip_ratio/high_max": 0.0047473984595853835, "clip_ratio/high_mean": 0.0047473984595853835, "clip_ratio/low_mean": 0.004583204019581899, "clip_ratio/low_min": 0.004583204019581899, "clip_ratio/region_mean": 0.009330602537374943, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 475.75, "completions/mean_terminated_length": 475.75, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.05976028647273779, "epoch": 1.252000050080002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002983210841193795, "kl": 0.008457929827272892, "learning_rate": 7.46297999957613e-06, "loss": 0.0, "num_tokens": 4975576.0, "reward": -0.27865803241729736, "reward_std": 4.826484680175781, "rewards/rollout_reward_func/mean": -0.27865803241729736, "rewards/rollout_reward_func/std": 4.8264851570129395, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.5625, "sampling/sampling_logp_difference/mean": 0.4324151575565338, "step": 313, "step_time": 12.383697896999365 }, { "clip_ratio/high_max": 0.0071051898412406445, "clip_ratio/high_mean": 0.0071051898412406445, "clip_ratio/low_mean": 0.006084051099605858, "clip_ratio/low_min": 0.006084051099605858, "clip_ratio/region_mean": 0.013189240824431181, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 451.125, "completions/mean_terminated_length": 451.125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.05994137143716216, "epoch": 1.256000050240002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010654724901542068, "kl": 0.009673936641775072, "learning_rate": 7.462979999573063e-06, "loss": 0.0, "num_tokens": 4995657.0, "reward": 11.099104881286621, "reward_std": 23.666709899902344, "rewards/rollout_reward_func/mean": 11.099104881286621, "rewards/rollout_reward_func/std": 23.666709899902344, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.8125, "sampling/sampling_logp_difference/mean": 0.4479699432849884, "step": 314, "step_time": 12.21918781902059 }, { "clip_ratio/high_max": 0.004658553923945874, "clip_ratio/high_mean": 0.004658553923945874, "clip_ratio/low_mean": 0.006403300212696195, "clip_ratio/low_min": 0.006403300212696195, "clip_ratio/region_mean": 0.011061854078434408, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 485.0625, "completions/mean_terminated_length": 485.0625, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 0.058969973120838404, "epoch": 1.2600000504000021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010270103812217712, "kl": 0.007417409331537783, "learning_rate": 7.462979999569986e-06, "loss": 0.0, "num_tokens": 5016346.0, "reward": 8.621185302734375, "reward_std": 13.015069961547852, "rewards/rollout_reward_func/mean": 8.621185302734375, "rewards/rollout_reward_func/std": 13.015069007873535, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.125, "sampling/sampling_logp_difference/mean": 0.43380922079086304, "step": 315, "step_time": 12.434123437997187 }, { "clip_ratio/high_max": 0.004000634333351627, "clip_ratio/high_mean": 0.004000634333351627, "clip_ratio/low_mean": 0.006353482662234455, "clip_ratio/low_min": 0.006353482662234455, "clip_ratio/region_mean": 0.010354116966482252, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 500.75, "completions/mean_terminated_length": 500.75, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "entropy": 0.05718414345756173, "epoch": 1.264000050560002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017133485525846481, "kl": 0.012295163818635046, "learning_rate": 7.462979999566898e-06, "loss": 0.0, "num_tokens": 5037318.0, "reward": -0.5189194083213806, "reward_std": 9.180360794067383, "rewards/rollout_reward_func/mean": -0.5189194083213806, "rewards/rollout_reward_func/std": 9.180360794067383, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.25, "sampling/sampling_logp_difference/mean": 0.40297237038612366, "step": 316, "step_time": 12.56675771099981 }, { "clip_ratio/high_max": 0.007722510257735848, "clip_ratio/high_mean": 0.007722510257735848, "clip_ratio/low_mean": 0.004911288502626121, "clip_ratio/low_min": 0.004911288502626121, "clip_ratio/region_mean": 0.012633798643946648, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 478.3125, "completions/mean_terminated_length": 478.3125, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.061181637458503246, "epoch": 1.2680000507200021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0007944426615722477, "kl": 0.01110104299732484, "learning_rate": 7.462979999563799e-06, "loss": 0.0, "num_tokens": 5057867.0, "reward": 1.9031760692596436, "reward_std": 8.116804122924805, "rewards/rollout_reward_func/mean": 1.9031760692596436, "rewards/rollout_reward_func/std": 8.116804122924805, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.0625, "sampling/sampling_logp_difference/mean": 0.4437119960784912, "step": 317, "step_time": 12.42425468199508 }, { "clip_ratio/high_max": 0.0051042347913607955, "clip_ratio/high_mean": 0.0051042347913607955, "clip_ratio/low_mean": 0.007260087091708556, "clip_ratio/low_min": 0.007260087091708556, "clip_ratio/region_mean": 0.01236432185396552, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 477.875, "completions/mean_terminated_length": 477.875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.061070284340530634, "epoch": 1.272000050880002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024364651180803776, "kl": 0.01008367573376745, "learning_rate": 7.462979999560689e-06, "loss": 0.0, "num_tokens": 5078444.0, "reward": 2.123398542404175, "reward_std": 25.817649841308594, "rewards/rollout_reward_func/mean": 2.123398542404175, "rewards/rollout_reward_func/std": 25.817651748657227, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.125, "sampling/sampling_logp_difference/mean": 0.43722957372665405, "step": 318, "step_time": 12.444761751001352 }, { "clip_ratio/high_max": 0.004506497178226709, "clip_ratio/high_mean": 0.004506497178226709, "clip_ratio/low_mean": 0.007385977893136442, "clip_ratio/low_min": 0.007385977893136442, "clip_ratio/region_mean": 0.011892475071363151, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 488.8125, "completions/mean_terminated_length": 488.8125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.05940265487879515, "epoch": 1.276000051040002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0013490350684151053, "kl": 0.011500844091642648, "learning_rate": 7.462979999557567e-06, "loss": 0.0, "num_tokens": 5099195.0, "reward": 4.353598117828369, "reward_std": 10.671649932861328, "rewards/rollout_reward_func/mean": 4.353598117828369, "rewards/rollout_reward_func/std": 10.671651840209961, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.25, "sampling/sampling_logp_difference/mean": 0.44270044565200806, "step": 319, "step_time": 12.50141818299744 }, { "clip_ratio/high_max": 0.0055155224981717765, "clip_ratio/high_mean": 0.0055155224981717765, "clip_ratio/low_mean": 0.006579984095878899, "clip_ratio/low_min": 0.006579984095878899, "clip_ratio/region_mean": 0.012095506477635354, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 465.1875, "completions/mean_terminated_length": 465.1875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.05806524585932493, "epoch": 1.280000051200002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009684296674095094, "kl": 0.007302577781956643, "learning_rate": 7.462979999554436e-06, "loss": 0.0, "num_tokens": 5119510.0, "reward": 3.7468695640563965, "reward_std": 8.24494743347168, "rewards/rollout_reward_func/mean": 3.7468695640563965, "rewards/rollout_reward_func/std": 8.24494743347168, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.6875, "sampling/sampling_logp_difference/mean": 0.44716671109199524, "step": 320, "step_time": 12.340300815012597 }, { "clip_ratio/high_max": 0.008930810377933085, "clip_ratio/high_mean": 0.008930810377933085, "clip_ratio/low_mean": 0.0037604482495225966, "clip_ratio/low_min": 0.0037604482495225966, "clip_ratio/region_mean": 0.012691258685663342, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 777.125, "completions/mean_terminated_length": 777.125, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "entropy": 0.04953635297715664, "epoch": 1.284000051360002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 23.686630249023438, "kl": 8.191130811988842, "learning_rate": 7.462979999551293e-06, "loss": 0.0315, "num_tokens": 5144845.0, "reward": -5.672852039337158, "reward_std": 10.382848739624023, "rewards/rollout_reward_func/mean": -5.672852039337158, "rewards/rollout_reward_func/std": 10.38284969329834, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.390625, "sampling/sampling_logp_difference/mean": 0.3745320439338684, "step": 321, "step_time": 15.768591948988615 }, { "clip_ratio/high_max": 0.006009313452523202, "clip_ratio/high_mean": 0.006009313452523202, "clip_ratio/low_mean": 0.005917307047639042, "clip_ratio/low_min": 0.005917307047639042, "clip_ratio/region_mean": 0.011926620500162244, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 719.875, "completions/mean_terminated_length": 719.875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.050230386201292276, "epoch": 1.288000051520002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003032830310985446, "kl": 0.006993115937802941, "learning_rate": 7.462979999548138e-06, "loss": 0.0, "num_tokens": 5169244.0, "reward": 0.7719447612762451, "reward_std": 25.466087341308594, "rewards/rollout_reward_func/mean": 0.7719447612762451, "rewards/rollout_reward_func/std": 25.466089248657227, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.98445510864258, "sampling/sampling_logp_difference/mean": 0.37494802474975586, "step": 322, "step_time": 19.616102453990607 }, { "clip_ratio/high_max": 0.006533676292747259, "clip_ratio/high_mean": 0.006533676292747259, "clip_ratio/low_mean": 0.005022172117605805, "clip_ratio/low_min": 0.005022172117605805, "clip_ratio/region_mean": 0.011555848410353065, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 768.8125, "completions/mean_terminated_length": 768.8125, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "entropy": 0.049626300111413, "epoch": 1.292000051680002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002418217249214649, "kl": 0.009957067668437958, "learning_rate": 7.462979999544972e-06, "loss": 0.0, "num_tokens": 5194457.0, "reward": 0.8500571250915527, "reward_std": 12.989143371582031, "rewards/rollout_reward_func/mean": 0.8500571250915527, "rewards/rollout_reward_func/std": 12.989143371582031, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.203125, "sampling/sampling_logp_difference/mean": 0.35323700308799744, "step": 323, "step_time": 15.689450544996362 }, { "clip_ratio/high_max": 0.003891672851750627, "clip_ratio/high_mean": 0.003891672851750627, "clip_ratio/low_mean": 0.005592844448983669, "clip_ratio/low_min": 0.005592844448983669, "clip_ratio/region_mean": 0.009484517271630466, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 802.375, "completions/mean_terminated_length": 802.375, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "entropy": 0.047686658799648285, "epoch": 1.2960000518400021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01907181739807129, "kl": 0.016789826680906117, "learning_rate": 7.462979999541796e-06, "loss": 0.0001, "num_tokens": 5220253.0, "reward": -2.106893539428711, "reward_std": 5.394664764404297, "rewards/rollout_reward_func/mean": -2.106893539428711, "rewards/rollout_reward_func/std": 5.394664764404297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.875, "sampling/sampling_logp_difference/mean": 0.338699072599411, "step": 324, "step_time": 15.547471336001763 }, { "clip_ratio/high_max": 0.006058851897250861, "clip_ratio/high_mean": 0.006058851897250861, "clip_ratio/low_mean": 0.005657858186168596, "clip_ratio/low_min": 0.005657858186168596, "clip_ratio/region_mean": 0.011716710054315627, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 741.1875, "completions/mean_terminated_length": 741.1875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "entropy": 0.051305538043379784, "epoch": 1.300000052000002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033551985397934914, "kl": 0.013529515592381358, "learning_rate": 7.4629799995386074e-06, "loss": 0.0, "num_tokens": 5244976.0, "reward": -7.604296684265137, "reward_std": 9.542521476745605, "rewards/rollout_reward_func/mean": -7.604296684265137, "rewards/rollout_reward_func/std": 9.542521476745605, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.875, "sampling/sampling_logp_difference/mean": 0.3602525591850281, "step": 325, "step_time": 15.342742891982198 }, { "clip_ratio/high_max": 0.00829805654939264, "clip_ratio/high_mean": 0.00829805654939264, "clip_ratio/low_mean": 0.0031420258164871484, "clip_ratio/low_min": 0.0031420258164871484, "clip_ratio/region_mean": 0.011440082336775959, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 788.1875, "completions/mean_terminated_length": 788.1875, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "entropy": 0.04901377111673355, "epoch": 1.3040000521600021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029735523276031017, "kl": 0.013596097938716412, "learning_rate": 7.46297999953541e-06, "loss": 0.0001, "num_tokens": 5270513.0, "reward": -6.7111711502075195, "reward_std": 8.764341354370117, "rewards/rollout_reward_func/mean": -6.7111711502075195, "rewards/rollout_reward_func/std": 8.764341354370117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.97265625, "sampling/sampling_logp_difference/mean": 0.3501136898994446, "step": 326, "step_time": 15.710848062990408 }, { "clip_ratio/high_max": 0.005466414091642946, "clip_ratio/high_mean": 0.005466414091642946, "clip_ratio/low_mean": 0.004213039937894791, "clip_ratio/low_min": 0.004213039937894791, "clip_ratio/region_mean": 0.009679453971330076, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 753.6875, "completions/mean_terminated_length": 753.6875, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "entropy": 0.05162303056567907, "epoch": 1.308000052320002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003461049869656563, "kl": 0.015329333255067468, "learning_rate": 7.4629799995322e-06, "loss": 0.0001, "num_tokens": 5295458.0, "reward": -4.9077253341674805, "reward_std": 10.749165534973145, "rewards/rollout_reward_func/mean": -4.9077253341674805, "rewards/rollout_reward_func/std": 10.749166488647461, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.939697265625, "sampling/sampling_logp_difference/mean": 0.35634660720825195, "step": 327, "step_time": 15.506193765999342 }, { "clip_ratio/high_max": 0.005247855879133567, "clip_ratio/high_mean": 0.005247855879133567, "clip_ratio/low_mean": 0.004903109453152865, "clip_ratio/low_min": 0.004903109453152865, "clip_ratio/region_mean": 0.010150965303182602, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 761.75, "completions/mean_terminated_length": 761.75, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "entropy": 0.04850701382383704, "epoch": 1.3120000524800021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004949595313519239, "kl": 0.018330377293750644, "learning_rate": 7.46297999952898e-06, "loss": 0.0001, "num_tokens": 5320541.0, "reward": -5.165726661682129, "reward_std": 7.2934699058532715, "rewards/rollout_reward_func/mean": -5.165726661682129, "rewards/rollout_reward_func/std": 7.2934699058532715, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.3125, "sampling/sampling_logp_difference/mean": 0.3616620898246765, "step": 328, "step_time": 15.708304815008887 }, { "clip_ratio/high_max": 0.004447560932021588, "clip_ratio/high_mean": 0.004447560932021588, "clip_ratio/low_mean": 0.007469348143786192, "clip_ratio/low_min": 0.007469348143786192, "clip_ratio/region_mean": 0.011916909017600119, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 711.375, "completions/mean_terminated_length": 711.375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.0519402283243835, "epoch": 1.316000052640002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038671367801725864, "kl": 0.019517106586135924, "learning_rate": 7.462979999525748e-06, "loss": 0.0001, "num_tokens": 5344785.0, "reward": 5.669243335723877, "reward_std": 26.8841552734375, "rewards/rollout_reward_func/mean": 5.669243335723877, "rewards/rollout_reward_func/std": 26.884159088134766, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.59375, "sampling/sampling_logp_difference/mean": 0.37179097533226013, "step": 329, "step_time": 15.234093330996984 }, { "clip_ratio/high_max": 0.00442786916391924, "clip_ratio/high_mean": 0.00442786916391924, "clip_ratio/low_mean": 0.0057207338977605104, "clip_ratio/low_min": 0.0057207338977605104, "clip_ratio/region_mean": 0.01014860306167975, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 752.5, "completions/mean_terminated_length": 752.5, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "entropy": 0.05224268930032849, "epoch": 1.3200000528000021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036050393246114254, "kl": 0.015054374583996832, "learning_rate": 7.4629799995225045e-06, "loss": 0.0001, "num_tokens": 5369719.0, "reward": -1.6511424779891968, "reward_std": 8.066100120544434, "rewards/rollout_reward_func/mean": -1.6511424779891968, "rewards/rollout_reward_func/std": 8.06610107421875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.6875, "sampling/sampling_logp_difference/mean": 0.35447072982788086, "step": 330, "step_time": 15.486407771000813 }, { "clip_ratio/high_max": 0.005434655409771949, "clip_ratio/high_mean": 0.005434655409771949, "clip_ratio/low_mean": 0.004870714474236593, "clip_ratio/low_min": 0.004870714474236593, "clip_ratio/region_mean": 0.01030536973848939, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 782.6875, "completions/mean_terminated_length": 782.6875, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "entropy": 0.050912002101540565, "epoch": 1.324000052960002e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.13599710166454315, "kl": 0.08559688448440284, "learning_rate": 7.462979999519251e-06, "loss": 0.0003, "num_tokens": 5395176.0, "reward": -2.8190231323242188, "reward_std": 11.925765991210938, "rewards/rollout_reward_func/mean": -2.8190231323242188, "rewards/rollout_reward_func/std": 11.925766944885254, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.75, "sampling/sampling_logp_difference/mean": 0.3406500220298767, "step": 331, "step_time": 15.502287302013428 }, { "clip_ratio/high_max": 0.005756538361310959, "clip_ratio/high_mean": 0.005756538361310959, "clip_ratio/low_mean": 0.004794673121068627, "clip_ratio/low_min": 0.004794673121068627, "clip_ratio/region_mean": 0.010551211424171925, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 760.5, "completions/mean_terminated_length": 760.5, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 0.051622338593006134, "epoch": 1.3280000531200021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006187882740050554, "kl": 0.02246831450611353, "learning_rate": 7.4629799995159866e-06, "loss": 0.0001, "num_tokens": 5420235.0, "reward": -3.703737497329712, "reward_std": 11.289987564086914, "rewards/rollout_reward_func/mean": -3.703737497329712, "rewards/rollout_reward_func/std": 11.28998851776123, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.5, "sampling/sampling_logp_difference/mean": 0.3426687717437744, "step": 332, "step_time": 15.406420558989339 }, { "clip_ratio/high_max": 0.004389902402181178, "clip_ratio/high_mean": 0.004389902402181178, "clip_ratio/low_mean": 0.007090635830536485, "clip_ratio/low_min": 0.007090635830536485, "clip_ratio/region_mean": 0.011480538174510002, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 730.3125, "completions/mean_terminated_length": 730.3125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.05081031424924731, "epoch": 1.3320000532800022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005006625782698393, "kl": 0.017255843500606716, "learning_rate": 7.46297999951271e-06, "loss": 0.0001, "num_tokens": 5444808.0, "reward": 4.8913044929504395, "reward_std": 24.198928833007812, "rewards/rollout_reward_func/mean": 4.8913044929504395, "rewards/rollout_reward_func/std": 24.198930740356445, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.078125, "sampling/sampling_logp_difference/mean": 0.35198214650154114, "step": 333, "step_time": 15.51814464299241 }, { "clip_ratio/high_max": 0.004944100190186873, "clip_ratio/high_mean": 0.004944100190186873, "clip_ratio/low_mean": 0.003828115848591551, "clip_ratio/low_min": 0.003828115848591551, "clip_ratio/region_mean": 0.008772216038778424, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 768.625, "completions/mean_terminated_length": 768.625, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "entropy": 0.05095800943672657, "epoch": 1.3360000534400021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035330564714968204, "kl": 0.01711257128044963, "learning_rate": 7.462979999509425e-06, "loss": 0.0001, "num_tokens": 5470013.0, "reward": -1.5287352800369263, "reward_std": 9.465808868408203, "rewards/rollout_reward_func/mean": -1.5287352800369263, "rewards/rollout_reward_func/std": 9.465808868408203, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.921875, "sampling/sampling_logp_difference/mean": 0.35106101632118225, "step": 334, "step_time": 15.63310942199314 }, { "clip_ratio/high_max": 0.005306420207489282, "clip_ratio/high_mean": 0.005306420207489282, "clip_ratio/low_mean": 0.0050155782664660364, "clip_ratio/low_min": 0.0050155782664660364, "clip_ratio/region_mean": 0.010321998503059149, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 774.625, "completions/mean_terminated_length": 774.625, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "entropy": 0.05094103189185262, "epoch": 1.3400000536000022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027216956950724125, "kl": 0.013509329175576568, "learning_rate": 7.462979999506125e-06, "loss": 0.0001, "num_tokens": 5495330.0, "reward": -6.549553871154785, "reward_std": 7.10978364944458, "rewards/rollout_reward_func/mean": -6.549553871154785, "rewards/rollout_reward_func/std": 7.10978364944458, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.6484375, "sampling/sampling_logp_difference/mean": 0.3368455767631531, "step": 335, "step_time": 15.693685518985149 }, { "clip_ratio/high_max": 0.0052575234294636175, "clip_ratio/high_mean": 0.0052575234294636175, "clip_ratio/low_mean": 0.003976857566158287, "clip_ratio/low_min": 0.003976857566158287, "clip_ratio/region_mean": 0.009234380966518074, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 793.0, "completions/mean_terminated_length": 793.0, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "entropy": 0.048654396552592516, "epoch": 1.3440000537600021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005316853057593107, "kl": 0.01842357497662306, "learning_rate": 7.462979999502818e-06, "loss": 0.0001, "num_tokens": 5520959.0, "reward": -6.195209503173828, "reward_std": 6.895548343658447, "rewards/rollout_reward_func/mean": -6.195209503173828, "rewards/rollout_reward_func/std": 6.895548343658447, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.591796875, "sampling/sampling_logp_difference/mean": 0.34062761068344116, "step": 336, "step_time": 15.718944132000615 }, { "clip_ratio/high_max": 0.00260583299677819, "clip_ratio/high_mean": 0.00260583299677819, "clip_ratio/low_mean": 0.005530011956579983, "clip_ratio/low_min": 0.005530011956579983, "clip_ratio/region_mean": 0.008135845069773495, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 757.5, "completions/mean_terminated_length": 757.5, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 0.05208770092576742, "epoch": 1.3480000539200022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033526085317134857, "kl": 0.015389803564175963, "learning_rate": 7.4629799994994965e-06, "loss": 0.0001, "num_tokens": 5546011.0, "reward": 6.927764892578125, "reward_std": 21.972455978393555, "rewards/rollout_reward_func/mean": 6.927764892578125, "rewards/rollout_reward_func/std": 21.972455978393555, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.9375, "sampling/sampling_logp_difference/mean": 0.3547701835632324, "step": 337, "step_time": 15.544280302987318 }, { "clip_ratio/high_max": 0.004606835806043819, "clip_ratio/high_mean": 0.004606835806043819, "clip_ratio/low_mean": 0.004008225514553487, "clip_ratio/low_min": 0.004008225514553487, "clip_ratio/region_mean": 0.008615061291493475, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 793.6875, "completions/mean_terminated_length": 793.6875, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "entropy": 0.04762946953997016, "epoch": 1.3520000540800021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005531526170670986, "kl": 0.020992812234908342, "learning_rate": 7.462979999496166e-06, "loss": 0.0001, "num_tokens": 5571636.0, "reward": -1.4824516773223877, "reward_std": 8.680548667907715, "rewards/rollout_reward_func/mean": -1.4824516773223877, "rewards/rollout_reward_func/std": 8.680549621582031, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.43750762939453, "sampling/sampling_logp_difference/mean": 0.3360477387905121, "step": 338, "step_time": 15.792682255996624 }, { "clip_ratio/high_max": 0.005491138203069568, "clip_ratio/high_mean": 0.005491138203069568, "clip_ratio/low_mean": 0.004464271274628118, "clip_ratio/low_min": 0.004464271274628118, "clip_ratio/region_mean": 0.009955409506801516, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 758.0625, "completions/mean_terminated_length": 758.0625, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "entropy": 0.054124871734529734, "epoch": 1.3560000542400022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006805215496569872, "kl": 0.020180948078632355, "learning_rate": 7.462979999492824e-06, "loss": 0.0001, "num_tokens": 5596661.0, "reward": -6.416113376617432, "reward_std": 7.257163047790527, "rewards/rollout_reward_func/mean": -6.416113376617432, "rewards/rollout_reward_func/std": 7.2571635246276855, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.2826042175293, "sampling/sampling_logp_difference/mean": 0.336163729429245, "step": 339, "step_time": 15.59547514100268 }, { "clip_ratio/high_max": 0.0037546000676229596, "clip_ratio/high_mean": 0.0037546000676229596, "clip_ratio/low_mean": 0.006290162680670619, "clip_ratio/low_min": 0.006290162680670619, "clip_ratio/region_mean": 0.010044762748293579, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 738.3125, "completions/mean_terminated_length": 738.3125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.049805560149252415, "epoch": 1.3600000544000023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002379302866756916, "kl": 0.01400084001943469, "learning_rate": 7.46297999948947e-06, "loss": 0.0001, "num_tokens": 5621397.0, "reward": 2.1813125610351562, "reward_std": 23.601055145263672, "rewards/rollout_reward_func/mean": 2.1813125610351562, "rewards/rollout_reward_func/std": 23.60105323791504, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.4375, "sampling/sampling_logp_difference/mean": 0.3498060405254364, "step": 340, "step_time": 15.64786771699437 }, { "clip_ratio/high_max": 0.005941967654507607, "clip_ratio/high_mean": 0.005941967654507607, "clip_ratio/low_mean": 0.004563594120554626, "clip_ratio/low_min": 0.004563594120554626, "clip_ratio/region_mean": 0.01050556160043925, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 759.5, "completions/mean_terminated_length": 759.5, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "entropy": 0.05036119185388088, "epoch": 1.3640000545600022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002590740565210581, "kl": 0.014205773710273206, "learning_rate": 7.462979999486106e-06, "loss": 0.0001, "num_tokens": 5646444.0, "reward": -3.6115376949310303, "reward_std": 8.51977825164795, "rewards/rollout_reward_func/mean": -3.6115376949310303, "rewards/rollout_reward_func/std": 8.51977825164795, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.40625, "sampling/sampling_logp_difference/mean": 0.3581894040107727, "step": 341, "step_time": 15.598767914008931 }, { "clip_ratio/high_max": 0.0046032647078391165, "clip_ratio/high_mean": 0.0046032647078391165, "clip_ratio/low_mean": 0.005511896568350494, "clip_ratio/low_min": 0.005511896568350494, "clip_ratio/region_mean": 0.010115161188878119, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 739.625, "completions/mean_terminated_length": 739.625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "entropy": 0.053417830262333155, "epoch": 1.3680000547200022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004101408179849386, "kl": 0.014897051965817809, "learning_rate": 7.4629799994827295e-06, "loss": 0.0001, "num_tokens": 5671191.0, "reward": 1.0644080638885498, "reward_std": 26.87002182006836, "rewards/rollout_reward_func/mean": 1.0644080638885498, "rewards/rollout_reward_func/std": 26.87002182006836, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.5, "sampling/sampling_logp_difference/mean": 0.3603905141353607, "step": 342, "step_time": 15.653176811996673 }, { "clip_ratio/high_max": 0.005048712890129536, "clip_ratio/high_mean": 0.005048712890129536, "clip_ratio/low_mean": 0.005408234370406717, "clip_ratio/low_min": 0.005408234370406717, "clip_ratio/region_mean": 0.010456947260536253, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 789.0, "completions/mean_terminated_length": 789.0, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "entropy": 0.050693805795162916, "epoch": 1.3720000548800022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00966982264071703, "kl": 0.01576916262274608, "learning_rate": 7.462979999479345e-06, "loss": 0.0001, "num_tokens": 5696746.0, "reward": -3.8082098960876465, "reward_std": 4.993642807006836, "rewards/rollout_reward_func/mean": -3.8082098960876465, "rewards/rollout_reward_func/std": 4.993642807006836, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.5625, "sampling/sampling_logp_difference/mean": 0.3531162738800049, "step": 343, "step_time": 15.790778624999803 }, { "clip_ratio/high_max": 0.004435281356563792, "clip_ratio/high_mean": 0.004435281356563792, "clip_ratio/low_mean": 0.005340958799934015, "clip_ratio/low_min": 0.005340958799934015, "clip_ratio/region_mean": 0.009776240098290145, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 742.75, "completions/mean_terminated_length": 742.75, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.0501544582657516, "epoch": 1.3760000550400022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004284893162548542, "kl": 0.015322697930969298, "learning_rate": 7.462979999475947e-06, "loss": 0.0001, "num_tokens": 5721544.0, "reward": 1.9447436332702637, "reward_std": 25.66971778869629, "rewards/rollout_reward_func/mean": 1.9447436332702637, "rewards/rollout_reward_func/std": 25.669719696044922, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.375, "sampling/sampling_logp_difference/mean": 0.3580959141254425, "step": 344, "step_time": 15.562539246995584 }, { "clip_ratio/high_max": 0.005862918857019395, "clip_ratio/high_mean": 0.005862918857019395, "clip_ratio/low_mean": 0.004916544537991285, "clip_ratio/low_min": 0.004916544537991285, "clip_ratio/region_mean": 0.010779463278595358, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 763.75, "completions/mean_terminated_length": 763.75, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "entropy": 0.046793310437351465, "epoch": 1.3800000552000021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002124689519405365, "kl": 0.011988642974756658, "learning_rate": 7.462979999472538e-06, "loss": 0.0, "num_tokens": 5746662.0, "reward": 0.6129847764968872, "reward_std": 10.648056983947754, "rewards/rollout_reward_func/mean": 0.6129847764968872, "rewards/rollout_reward_func/std": 10.648056983947754, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.59375, "sampling/sampling_logp_difference/mean": 0.3458252251148224, "step": 345, "step_time": 15.60945547598385 }, { "clip_ratio/high_max": 0.006013625679770485, "clip_ratio/high_mean": 0.006013625679770485, "clip_ratio/low_mean": 0.005043976940214634, "clip_ratio/low_min": 0.005043976940214634, "clip_ratio/region_mean": 0.011057602474465966, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 770.0, "completions/mean_terminated_length": 770.0, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "entropy": 0.04875108366832137, "epoch": 1.3840000553600022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.019548607990145683, "kl": 0.018660271889530122, "learning_rate": 7.4629799994691185e-06, "loss": 0.0001, "num_tokens": 5771878.0, "reward": -4.061265468597412, "reward_std": 3.607438325881958, "rewards/rollout_reward_func/mean": -4.061265468597412, "rewards/rollout_reward_func/std": 3.607438564300537, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.875, "sampling/sampling_logp_difference/mean": 0.365853488445282, "step": 346, "step_time": 15.669036947991117 }, { "clip_ratio/high_max": 0.00424010373535566, "clip_ratio/high_mean": 0.00424010373535566, "clip_ratio/low_mean": 0.0038090287998784333, "clip_ratio/low_min": 0.0038090287998784333, "clip_ratio/region_mean": 0.008049132535234094, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 770.5, "completions/mean_terminated_length": 770.5, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.0494697131216526, "epoch": 1.3880000555200021e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004159148316830397, "kl": 0.014408132643438876, "learning_rate": 7.462979999465688e-06, "loss": 0.0001, "num_tokens": 5797112.0, "reward": -3.7251780033111572, "reward_std": 8.192832946777344, "rewards/rollout_reward_func/mean": -3.7251780033111572, "rewards/rollout_reward_func/std": 8.19283390045166, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.6875, "sampling/sampling_logp_difference/mean": 0.3474154472351074, "step": 347, "step_time": 15.631650324001384 }, { "clip_ratio/high_max": 0.004847022355534136, "clip_ratio/high_mean": 0.004847022355534136, "clip_ratio/low_mean": 0.005977511405944824, "clip_ratio/low_min": 0.005977511405944824, "clip_ratio/region_mean": 0.01082453376147896, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 777.4375, "completions/mean_terminated_length": 777.4375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "entropy": 0.04858139343559742, "epoch": 1.3920000556800022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0021578408777713776, "kl": 0.012782125559169799, "learning_rate": 7.4629799994622466e-06, "loss": 0.0, "num_tokens": 5822459.0, "reward": -1.9393727779388428, "reward_std": 7.806313991546631, "rewards/rollout_reward_func/mean": -1.9393727779388428, "rewards/rollout_reward_func/std": 7.806313991546631, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.10546875, "sampling/sampling_logp_difference/mean": 0.36096274852752686, "step": 348, "step_time": 15.631217317997653 }, { "clip_ratio/high_max": 0.004001254157628864, "clip_ratio/high_mean": 0.004001254157628864, "clip_ratio/low_mean": 0.005144729700987227, "clip_ratio/low_min": 0.005144729700987227, "clip_ratio/region_mean": 0.009145983844064176, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 772.4375, "completions/mean_terminated_length": 772.4375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "entropy": 0.04709845036268234, "epoch": 1.3960000558400023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002002219669520855, "kl": 0.012487411266192794, "learning_rate": 7.462979999458794e-06, "loss": 0.0, "num_tokens": 5847731.0, "reward": -0.5798298120498657, "reward_std": 7.678769111633301, "rewards/rollout_reward_func/mean": -0.5798298120498657, "rewards/rollout_reward_func/std": 7.678769588470459, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.98828125, "sampling/sampling_logp_difference/mean": 0.3521181344985962, "step": 349, "step_time": 15.552264584002842 }, { "clip_ratio/high_max": 0.0066296536242589355, "clip_ratio/high_mean": 0.0066296536242589355, "clip_ratio/low_mean": 0.004425009625265375, "clip_ratio/low_min": 0.004425009625265375, "clip_ratio/region_mean": 0.01105466327862814, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 771.4375, "completions/mean_terminated_length": 771.4375, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "entropy": 0.04750405577942729, "epoch": 1.4000000560000022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004153171554207802, "kl": 0.017844390938989818, "learning_rate": 7.4629799994553305e-06, "loss": 0.0001, "num_tokens": 5872985.0, "reward": -7.788618087768555, "reward_std": 8.49268913269043, "rewards/rollout_reward_func/mean": -7.788618087768555, "rewards/rollout_reward_func/std": 8.492690086364746, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.0, "sampling/sampling_logp_difference/mean": 0.3649453818798065, "step": 350, "step_time": 15.529238557006465 }, { "clip_ratio/high_max": 0.0026766981463879347, "clip_ratio/high_mean": 0.0026766981463879347, "clip_ratio/low_mean": 0.006402090308256447, "clip_ratio/low_min": 0.006402090308256447, "clip_ratio/region_mean": 0.009078788512852043, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 764.8125, "completions/mean_terminated_length": 764.8125, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "entropy": 0.04792841384187341, "epoch": 1.4040000561600023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024028392508625984, "kl": 0.015744391013868153, "learning_rate": 7.462979999451856e-06, "loss": 0.0001, "num_tokens": 5898112.0, "reward": -0.3295444846153259, "reward_std": 5.855626583099365, "rewards/rollout_reward_func/mean": -0.3295444846153259, "rewards/rollout_reward_func/std": 5.855626583099365, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.6875, "sampling/sampling_logp_difference/mean": 0.3576817810535431, "step": 351, "step_time": 15.57330867399287 }, { "clip_ratio/high_max": 0.007365958241280168, "clip_ratio/high_mean": 0.007365958241280168, "clip_ratio/low_mean": 0.004204307217150927, "clip_ratio/low_min": 0.004204307217150927, "clip_ratio/region_mean": 0.011570265400223434, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 804.75, "completions/mean_terminated_length": 804.75, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "entropy": 0.0459752082824707, "epoch": 1.4080000563200022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002092983340844512, "kl": 0.015749074518680573, "learning_rate": 7.46297999944837e-06, "loss": 0.0001, "num_tokens": 5923928.0, "reward": -1.3452718257904053, "reward_std": 13.049272537231445, "rewards/rollout_reward_func/mean": -1.3452718257904053, "rewards/rollout_reward_func/std": 13.049273490905762, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.89064407348633, "sampling/sampling_logp_difference/mean": 0.3397406339645386, "step": 352, "step_time": 15.733863558009034 }, { "clip_ratio/high_max": 0.00575511361239478, "clip_ratio/high_mean": 0.00575511361239478, "clip_ratio/low_mean": 0.005538981524296105, "clip_ratio/low_min": 0.005538981524296105, "clip_ratio/region_mean": 0.011294094962067902, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 775.875, "completions/mean_terminated_length": 775.875, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "entropy": 0.04737034672871232, "epoch": 1.4120000564800023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004977892152965069, "kl": 0.02091253479011357, "learning_rate": 7.462979999444872e-06, "loss": 0.0001, "num_tokens": 5949237.0, "reward": -4.173373222351074, "reward_std": 5.429490566253662, "rewards/rollout_reward_func/mean": -4.173373222351074, "rewards/rollout_reward_func/std": 5.42949104309082, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.03125, "sampling/sampling_logp_difference/mean": 0.35778194665908813, "step": 353, "step_time": 15.631008557014866 }, { "clip_ratio/high_max": 0.00363061684765853, "clip_ratio/high_mean": 0.00363061684765853, "clip_ratio/low_mean": 0.005157296865945682, "clip_ratio/low_min": 0.005157296865945682, "clip_ratio/region_mean": 0.008787913655396551, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 760.375, "completions/mean_terminated_length": 760.375, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "entropy": 0.049254370387643576, "epoch": 1.4160000566400022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026428899727761745, "kl": 0.015898832818493247, "learning_rate": 7.462979999441365e-06, "loss": 0.0001, "num_tokens": 5974299.0, "reward": -2.3151094913482666, "reward_std": 4.5245466232299805, "rewards/rollout_reward_func/mean": -2.3151094913482666, "rewards/rollout_reward_func/std": 4.5245466232299805, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.57814025878906, "sampling/sampling_logp_difference/mean": 0.35354098677635193, "step": 354, "step_time": 15.616530181992857 }, { "clip_ratio/high_max": 0.004884520196355879, "clip_ratio/high_mean": 0.004884520196355879, "clip_ratio/low_mean": 0.006798992777476087, "clip_ratio/low_min": 0.006798992777476087, "clip_ratio/region_mean": 0.011683513061143458, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 738.375, "completions/mean_terminated_length": 738.375, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "entropy": 0.05075767822563648, "epoch": 1.4200000568000023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.6490936279296875, "kl": 0.41917550342623144, "learning_rate": 7.462979999437846e-06, "loss": 0.0015, "num_tokens": 5998968.0, "reward": -3.1873111724853516, "reward_std": 5.3397932052612305, "rewards/rollout_reward_func/mean": -3.1873111724853516, "rewards/rollout_reward_func/std": 5.339792728424072, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.7265625, "sampling/sampling_logp_difference/mean": 0.3712223470211029, "step": 355, "step_time": 15.338218546006829 }, { "clip_ratio/high_max": 0.006267870892770588, "clip_ratio/high_mean": 0.006267870892770588, "clip_ratio/low_mean": 0.005019174015615135, "clip_ratio/low_min": 0.005019174015615135, "clip_ratio/region_mean": 0.011287044733762741, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 764.25, "completions/mean_terminated_length": 764.25, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 0.05104076024144888, "epoch": 1.4240000569600024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002408270025625825, "kl": 0.015115724760107696, "learning_rate": 7.462979999434316e-06, "loss": 0.0001, "num_tokens": 6024105.0, "reward": -4.596035957336426, "reward_std": 6.394330024719238, "rewards/rollout_reward_func/mean": -4.596035957336426, "rewards/rollout_reward_func/std": 6.3943305015563965, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.46875, "sampling/sampling_logp_difference/mean": 0.3506583273410797, "step": 356, "step_time": 15.772185245994478 }, { "clip_ratio/high_max": 0.0054440199746750295, "clip_ratio/high_mean": 0.0054440199746750295, "clip_ratio/low_mean": 0.005227879213634878, "clip_ratio/low_min": 0.005227879213634878, "clip_ratio/region_mean": 0.010671899130102247, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 750.625, "completions/mean_terminated_length": 750.625, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "entropy": 0.0488477498292923, "epoch": 1.4280000571200023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002587122144177556, "kl": 0.01542477065231651, "learning_rate": 7.462979999430774e-06, "loss": 0.0001, "num_tokens": 6048999.0, "reward": -3.0885231494903564, "reward_std": 5.092945575714111, "rewards/rollout_reward_func/mean": -3.0885231494903564, "rewards/rollout_reward_func/std": 5.092945575714111, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.5855827331543, "sampling/sampling_logp_difference/mean": 0.35858669877052307, "step": 357, "step_time": 15.394407886989939 }, { "clip_ratio/high_max": 0.0037535749725066125, "clip_ratio/high_mean": 0.0037535749725066125, "clip_ratio/low_mean": 0.006657468024059199, "clip_ratio/low_min": 0.006657468024059199, "clip_ratio/region_mean": 0.010411042952910066, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 733.375, "completions/mean_terminated_length": 733.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.051168684381991625, "epoch": 1.4320000572800024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006672271993011236, "kl": 0.019736990216188133, "learning_rate": 7.4629799994272234e-06, "loss": 0.0001, "num_tokens": 6073631.0, "reward": 2.126375913619995, "reward_std": 25.217851638793945, "rewards/rollout_reward_func/mean": 2.126375913619995, "rewards/rollout_reward_func/std": 25.217851638793945, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.28125, "sampling/sampling_logp_difference/mean": 0.3626718819141388, "step": 358, "step_time": 15.44461257999501 }, { "clip_ratio/high_max": 0.005682663177140057, "clip_ratio/high_mean": 0.005682663177140057, "clip_ratio/low_mean": 0.00350568990688771, "clip_ratio/low_min": 0.00350568990688771, "clip_ratio/region_mean": 0.009188353025820106, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 753.625, "completions/mean_terminated_length": 753.625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 0.05125953257083893, "epoch": 1.4360000574400023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.03164741024374962, "kl": 0.03471219423227012, "learning_rate": 7.46297999942366e-06, "loss": 0.0001, "num_tokens": 6098618.0, "reward": 2.9184441566467285, "reward_std": 25.169483184814453, "rewards/rollout_reward_func/mean": 2.9184441566467285, "rewards/rollout_reward_func/std": 25.16948699951172, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.875, "sampling/sampling_logp_difference/mean": 0.3489964008331299, "step": 359, "step_time": 15.666244788997574 }, { "clip_ratio/high_max": 0.004878833191469312, "clip_ratio/high_mean": 0.004878833191469312, "clip_ratio/low_mean": 0.006185884005390108, "clip_ratio/low_min": 0.006185884005390108, "clip_ratio/region_mean": 0.011064717080444098, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 753.5, "completions/mean_terminated_length": 753.5, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.05031737592071295, "epoch": 1.4400000576000023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.020125500857830048, "kl": 0.03707901027519256, "learning_rate": 7.4629799994200855e-06, "loss": 0.0001, "num_tokens": 6123546.0, "reward": -6.03153133392334, "reward_std": 4.443475723266602, "rewards/rollout_reward_func/mean": -6.03153133392334, "rewards/rollout_reward_func/std": 4.44347620010376, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.8125, "sampling/sampling_logp_difference/mean": 0.35568761825561523, "step": 360, "step_time": 15.757149080003728 }, { "clip_ratio/high_max": 0.005865142302354798, "clip_ratio/high_mean": 0.005865142302354798, "clip_ratio/low_mean": 0.0035403525398578495, "clip_ratio/low_min": 0.0035403525398578495, "clip_ratio/region_mean": 0.009405494900420308, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 793.9375, "completions/mean_terminated_length": 793.9375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "entropy": 0.04881521128118038, "epoch": 1.4440000577600023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009937291964888573, "kl": 0.02898913435637951, "learning_rate": 7.4629799994165e-06, "loss": 0.0001, "num_tokens": 6149175.0, "reward": -3.264674663543701, "reward_std": 6.02699613571167, "rewards/rollout_reward_func/mean": -3.264674663543701, "rewards/rollout_reward_func/std": 6.02699613571167, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.59375, "sampling/sampling_logp_difference/mean": 0.3415757417678833, "step": 361, "step_time": 15.680471558000136 }, { "clip_ratio/high_max": 0.003986214462202042, "clip_ratio/high_mean": 0.003986214462202042, "clip_ratio/low_mean": 0.005329250998329371, "clip_ratio/low_min": 0.005329250998329371, "clip_ratio/region_mean": 0.009315465460531414, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 755.9375, "completions/mean_terminated_length": 755.9375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.05085682915523648, "epoch": 1.4480000579200023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01149328425526619, "kl": 0.02946215239353478, "learning_rate": 7.462979999412904e-06, "loss": 0.0001, "num_tokens": 6174192.0, "reward": 7.734464168548584, "reward_std": 22.881126403808594, "rewards/rollout_reward_func/mean": 7.734464168548584, "rewards/rollout_reward_func/std": 22.88112449645996, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.0, "sampling/sampling_logp_difference/mean": 0.3465244174003601, "step": 362, "step_time": 15.576394026014896 }, { "clip_ratio/high_max": 0.005582637677434832, "clip_ratio/high_mean": 0.005582637677434832, "clip_ratio/low_mean": 0.005677561974152923, "clip_ratio/low_min": 0.005677561974152923, "clip_ratio/region_mean": 0.011260199709795415, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 749.375, "completions/mean_terminated_length": 749.375, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "entropy": 0.05343257123604417, "epoch": 1.4520000580800022e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.04550778120756149, "kl": 0.0536448456114158, "learning_rate": 7.462979999409297e-06, "loss": 0.0002, "num_tokens": 6199064.0, "reward": -3.4335198402404785, "reward_std": 9.116057395935059, "rewards/rollout_reward_func/mean": -3.4335198402404785, "rewards/rollout_reward_func/std": 9.116057395935059, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.31255340576172, "sampling/sampling_logp_difference/mean": 0.3608986437320709, "step": 363, "step_time": 15.282325740008673 }, { "clip_ratio/high_max": 0.004492041451158002, "clip_ratio/high_mean": 0.004492041451158002, "clip_ratio/low_mean": 0.00572157115675509, "clip_ratio/low_min": 0.00572157115675509, "clip_ratio/region_mean": 0.010213612578809261, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 794.8125, "completions/mean_terminated_length": 794.8125, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "entropy": 0.04797307588160038, "epoch": 1.4560000582400023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02481665089726448, "kl": 0.04031460708938539, "learning_rate": 7.462979999405678e-06, "loss": 0.0002, "num_tokens": 6224724.0, "reward": 0.8895223140716553, "reward_std": 11.40249252319336, "rewards/rollout_reward_func/mean": 0.8895223140716553, "rewards/rollout_reward_func/std": 11.40249252319336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.3125, "sampling/sampling_logp_difference/mean": 0.3344103991985321, "step": 364, "step_time": 15.698359872003493 }, { "clip_ratio/high_max": 0.0040246303542517126, "clip_ratio/high_mean": 0.0040246303542517126, "clip_ratio/low_mean": 0.0037729320174548775, "clip_ratio/low_min": 0.0037729320174548775, "clip_ratio/region_mean": 0.0077975624008104205, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 786.5625, "completions/mean_terminated_length": 786.5625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "entropy": 0.05223047034814954, "epoch": 1.4600000584000024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003328077495098114, "kl": 0.024773186771199107, "learning_rate": 7.462979999402048e-06, "loss": 0.0001, "num_tokens": 6250236.0, "reward": -2.7268283367156982, "reward_std": 8.905264854431152, "rewards/rollout_reward_func/mean": -2.7268283367156982, "rewards/rollout_reward_func/std": 8.905264854431152, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.5234375, "sampling/sampling_logp_difference/mean": 0.32337576150894165, "step": 365, "step_time": 15.583201639004983 }, { "clip_ratio/high_max": 0.00547191800433211, "clip_ratio/high_mean": 0.00547191800433211, "clip_ratio/low_mean": 0.003819767211098224, "clip_ratio/low_min": 0.003819767211098224, "clip_ratio/region_mean": 0.009291685244534165, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 753.75, "completions/mean_terminated_length": 753.75, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "entropy": 0.05489199608564377, "epoch": 1.4640000585600023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00343631231226027, "kl": 0.026844141772016883, "learning_rate": 7.462979999398408e-06, "loss": 0.0001, "num_tokens": 6275183.0, "reward": -2.7439088821411133, "reward_std": 7.373962879180908, "rewards/rollout_reward_func/mean": -2.7439088821411133, "rewards/rollout_reward_func/std": 7.373963356018066, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.43315505981445, "sampling/sampling_logp_difference/mean": 0.34808894991874695, "step": 366, "step_time": 15.529907058997196 }, { "clip_ratio/high_max": 0.005415625404566526, "clip_ratio/high_mean": 0.005415625404566526, "clip_ratio/low_mean": 0.003935463551897556, "clip_ratio/low_min": 0.003935463551897556, "clip_ratio/region_mean": 0.009351088898256421, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 778.6875, "completions/mean_terminated_length": 778.6875, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "entropy": 0.05261731380596757, "epoch": 1.4680000587200024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004708635620772839, "kl": 0.029752518283203244, "learning_rate": 7.462979999394756e-06, "loss": 0.0001, "num_tokens": 6300562.0, "reward": -6.366008758544922, "reward_std": 6.840712547302246, "rewards/rollout_reward_func/mean": -6.366008758544922, "rewards/rollout_reward_func/std": 6.840713024139404, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.09375, "sampling/sampling_logp_difference/mean": 0.3412788510322571, "step": 367, "step_time": 15.73830424300104 }, { "clip_ratio/high_max": 0.004952353774569929, "clip_ratio/high_mean": 0.004952353774569929, "clip_ratio/low_mean": 0.005313803558237851, "clip_ratio/low_min": 0.005313803558237851, "clip_ratio/region_mean": 0.010266157274600118, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 760.625, "completions/mean_terminated_length": 760.625, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "entropy": 0.050388614647090435, "epoch": 1.4720000588800023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005702740978449583, "kl": 0.03421391220763326, "learning_rate": 7.462979999391094e-06, "loss": 0.0001, "num_tokens": 6325608.0, "reward": -1.3090240955352783, "reward_std": 4.742094993591309, "rewards/rollout_reward_func/mean": -1.3090240955352783, "rewards/rollout_reward_func/std": 4.742094993591309, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.3125, "sampling/sampling_logp_difference/mean": 0.3508956730365753, "step": 368, "step_time": 15.506929035014764 }, { "clip_ratio/high_max": 0.004685435327701271, "clip_ratio/high_mean": 0.004685435327701271, "clip_ratio/low_mean": 0.0052543585625244305, "clip_ratio/low_min": 0.0052543585625244305, "clip_ratio/region_mean": 0.009939793904777616, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 759.8125, "completions/mean_terminated_length": 759.8125, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "entropy": 0.051974961534142494, "epoch": 1.4760000590400024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028263817075639963, "kl": 0.030398622155189514, "learning_rate": 7.4629799993874205e-06, "loss": 0.0001, "num_tokens": 6350671.0, "reward": -1.1944042444229126, "reward_std": 5.766422748565674, "rewards/rollout_reward_func/mean": -1.1944042444229126, "rewards/rollout_reward_func/std": 5.766422748565674, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.921875, "sampling/sampling_logp_difference/mean": 0.3524239659309387, "step": 369, "step_time": 15.656627186013793 }, { "clip_ratio/high_max": 0.004927885573124513, "clip_ratio/high_mean": 0.004927885573124513, "clip_ratio/low_mean": 0.0044319248117972165, "clip_ratio/low_min": 0.0044319248117972165, "clip_ratio/region_mean": 0.009359810326714069, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 761.9375, "completions/mean_terminated_length": 761.9375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "entropy": 0.051870225463062525, "epoch": 1.4800000592000023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007202779874205589, "kl": 0.039940948598086834, "learning_rate": 7.462979999383736e-06, "loss": 0.0001, "num_tokens": 6375762.0, "reward": -3.57757306098938, "reward_std": 3.7848215103149414, "rewards/rollout_reward_func/mean": -3.57757306098938, "rewards/rollout_reward_func/std": 3.7848217487335205, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.726566314697266, "sampling/sampling_logp_difference/mean": 0.3341920077800751, "step": 370, "step_time": 15.39941358899523 }, { "clip_ratio/high_max": 0.004796441236976534, "clip_ratio/high_mean": 0.004796441236976534, "clip_ratio/low_mean": 0.0033599957096157596, "clip_ratio/low_min": 0.0033599957096157596, "clip_ratio/region_mean": 0.008156436961144209, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 765.4375, "completions/mean_terminated_length": 765.4375, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "entropy": 0.051918181125074625, "epoch": 1.4840000593600024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035183108411729336, "kl": 0.028624419355764985, "learning_rate": 7.46297999938004e-06, "loss": 0.0001, "num_tokens": 6400921.0, "reward": -4.0116801261901855, "reward_std": 7.759888648986816, "rewards/rollout_reward_func/mean": -4.0116801261901855, "rewards/rollout_reward_func/std": 7.759889125823975, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.4375, "sampling/sampling_logp_difference/mean": 0.32248467206954956, "step": 371, "step_time": 15.697111377005058 }, { "clip_ratio/high_max": 0.0050623922143131495, "clip_ratio/high_mean": 0.0050623922143131495, "clip_ratio/low_mean": 0.003912129963282496, "clip_ratio/low_min": 0.003912129963282496, "clip_ratio/region_mean": 0.008974522119387984, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 749.3125, "completions/mean_terminated_length": 749.3125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.050463775638490915, "epoch": 1.4880000595200023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004931293427944183, "kl": 0.0316412562970072, "learning_rate": 7.462979999376333e-06, "loss": 0.0001, "num_tokens": 6425833.0, "reward": 2.9470183849334717, "reward_std": 25.843875885009766, "rewards/rollout_reward_func/mean": 2.9470183849334717, "rewards/rollout_reward_func/std": 25.8438720703125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.1875, "sampling/sampling_logp_difference/mean": 0.32414233684539795, "step": 372, "step_time": 15.616432655995595 }, { "clip_ratio/high_max": 0.006657718971837312, "clip_ratio/high_mean": 0.006657718971837312, "clip_ratio/low_mean": 0.0034170769795309752, "clip_ratio/low_min": 0.0034170769795309752, "clip_ratio/region_mean": 0.010074795864056796, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 769.3125, "completions/mean_terminated_length": 769.3125, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "entropy": 0.052219620905816555, "epoch": 1.4920000596800024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005325466860085726, "kl": 0.034084631595760584, "learning_rate": 7.462979999372614e-06, "loss": 0.0001, "num_tokens": 6451042.0, "reward": -4.290533065795898, "reward_std": 6.547536373138428, "rewards/rollout_reward_func/mean": -4.290533065795898, "rewards/rollout_reward_func/std": 6.547536849975586, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.75, "sampling/sampling_logp_difference/mean": 0.3397327661514282, "step": 373, "step_time": 15.490605223996681 }, { "clip_ratio/high_max": 0.006140714045614004, "clip_ratio/high_mean": 0.006140714045614004, "clip_ratio/low_mean": 0.0038661329890601337, "clip_ratio/low_min": 0.0038661329890601337, "clip_ratio/region_mean": 0.010006847092881799, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 761.0625, "completions/mean_terminated_length": 761.0625, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "entropy": 0.05366767104715109, "epoch": 1.4960000598400025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0051289768889546394, "kl": 0.03242518310435116, "learning_rate": 7.462979999368886e-06, "loss": 0.0001, "num_tokens": 6476105.0, "reward": -5.961989402770996, "reward_std": 12.070269584655762, "rewards/rollout_reward_func/mean": -5.961989402770996, "rewards/rollout_reward_func/std": 12.070268630981445, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.71875, "sampling/sampling_logp_difference/mean": 0.3295060992240906, "step": 374, "step_time": 15.677240236000216 }, { "clip_ratio/high_max": 0.0033940470311790705, "clip_ratio/high_mean": 0.0033940470311790705, "clip_ratio/low_mean": 0.005808552348753437, "clip_ratio/low_min": 0.005808552348753437, "clip_ratio/region_mean": 0.009202599350828677, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 790.5625, "completions/mean_terminated_length": 790.5625, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "entropy": 0.04944710107520223, "epoch": 1.5000000600000024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002992932917550206, "kl": 0.03108877339400351, "learning_rate": 7.462979999365147e-06, "loss": 0.0001, "num_tokens": 6501691.0, "reward": -0.5460730195045471, "reward_std": 7.997137546539307, "rewards/rollout_reward_func/mean": -0.5460730195045471, "rewards/rollout_reward_func/std": 7.997138023376465, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.125, "sampling/sampling_logp_difference/mean": 0.32222387194633484, "step": 375, "step_time": 15.757063723998726 }, { "clip_ratio/high_max": 0.005896504793781787, "clip_ratio/high_mean": 0.005896504793781787, "clip_ratio/low_mean": 0.0042698371689766645, "clip_ratio/low_min": 0.0042698371689766645, "clip_ratio/region_mean": 0.010166341962758452, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 757.1875, "completions/mean_terminated_length": 757.1875, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "entropy": 0.051800696179270744, "epoch": 1.5040000601600025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003774871351197362, "kl": 0.03114943322725594, "learning_rate": 7.462979999361396e-06, "loss": 0.0001, "num_tokens": 6526690.0, "reward": -5.011287689208984, "reward_std": 3.4411754608154297, "rewards/rollout_reward_func/mean": -5.011287689208984, "rewards/rollout_reward_func/std": 3.441175937652588, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.6875, "sampling/sampling_logp_difference/mean": 0.34076598286628723, "step": 376, "step_time": 15.384972903993912 }, { "clip_ratio/high_max": 0.0032860704814083874, "clip_ratio/high_mean": 0.0032860704814083874, "clip_ratio/low_mean": 0.004857097257627174, "clip_ratio/low_min": 0.004857097257627174, "clip_ratio/region_mean": 0.00814316765172407, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 718.3125, "completions/mean_terminated_length": 718.3125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.05272270832210779, "epoch": 1.5080000603200024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01313093677163124, "kl": 0.03860932448878884, "learning_rate": 7.462979999357635e-06, "loss": 0.0001, "num_tokens": 6551053.0, "reward": 3.453343629837036, "reward_std": 27.198333740234375, "rewards/rollout_reward_func/mean": 3.453343629837036, "rewards/rollout_reward_func/std": 27.198333740234375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.3125, "sampling/sampling_logp_difference/mean": 0.3437908887863159, "step": 377, "step_time": 15.541160148990457 }, { "clip_ratio/high_max": 0.005390767415519804, "clip_ratio/high_mean": 0.005390767415519804, "clip_ratio/low_mean": 0.004487221885938197, "clip_ratio/low_min": 0.004487221885938197, "clip_ratio/region_mean": 0.00987798918504268, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 738.6875, "completions/mean_terminated_length": 738.6875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.051679805386811495, "epoch": 1.5120000604800024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037231664173305035, "kl": 0.030981865944340825, "learning_rate": 7.462979999353861e-06, "loss": 0.0001, "num_tokens": 6575778.0, "reward": 0.807816743850708, "reward_std": 25.802125930786133, "rewards/rollout_reward_func/mean": 0.807816743850708, "rewards/rollout_reward_func/std": 25.802125930786133, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.47657012939453, "sampling/sampling_logp_difference/mean": 0.3514697551727295, "step": 378, "step_time": 15.50950541599741 }, { "clip_ratio/high_max": 0.00599946323200129, "clip_ratio/high_mean": 0.00599946323200129, "clip_ratio/low_mean": 0.0026948873419314623, "clip_ratio/low_min": 0.0026948873419314623, "clip_ratio/region_mean": 0.0086943504284136, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 770.6875, "completions/mean_terminated_length": 770.6875, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "entropy": 0.05278319027274847, "epoch": 1.5160000606400024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009503177367150784, "kl": 0.03255314799025655, "learning_rate": 7.4629799993500765e-06, "loss": 0.0001, "num_tokens": 6601013.0, "reward": -3.571002244949341, "reward_std": 10.394737243652344, "rewards/rollout_reward_func/mean": -3.571002244949341, "rewards/rollout_reward_func/std": 10.394737243652344, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.46875, "sampling/sampling_logp_difference/mean": 0.3293013870716095, "step": 379, "step_time": 15.645773482996447 }, { "clip_ratio/high_max": 0.002278203028254211, "clip_ratio/high_mean": 0.002278203028254211, "clip_ratio/low_mean": 0.005940391158219427, "clip_ratio/low_min": 0.005940391158219427, "clip_ratio/region_mean": 0.008218594244681299, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 749.0, "completions/mean_terminated_length": 749.0, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.05481242015957832, "epoch": 1.5200000608000024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026518176309764385, "kl": 0.024836050579324365, "learning_rate": 7.462979999346282e-06, "loss": 0.0001, "num_tokens": 6625912.0, "reward": 2.9539670944213867, "reward_std": 19.752119064331055, "rewards/rollout_reward_func/mean": 2.9539670944213867, "rewards/rollout_reward_func/std": 19.752119064331055, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.59375762939453, "sampling/sampling_logp_difference/mean": 0.3311879336833954, "step": 380, "step_time": 15.661703754980408 }, { "clip_ratio/high_max": 0.007003870792686939, "clip_ratio/high_mean": 0.007003870792686939, "clip_ratio/low_mean": 0.004100152291357517, "clip_ratio/low_min": 0.004100152291357517, "clip_ratio/region_mean": 0.011104023084044456, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 759.1875, "completions/mean_terminated_length": 759.1875, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "entropy": 0.04961817665025592, "epoch": 1.5240000609600025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0040076239965856075, "kl": 0.03609227016568184, "learning_rate": 7.462979999342475e-06, "loss": 0.0001, "num_tokens": 6650941.0, "reward": -3.6426987648010254, "reward_std": 5.602127552032471, "rewards/rollout_reward_func/mean": -3.6426987648010254, "rewards/rollout_reward_func/std": 5.602127552032471, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.65625, "sampling/sampling_logp_difference/mean": 0.35581180453300476, "step": 381, "step_time": 15.576350267991074 }, { "clip_ratio/high_max": 0.004433682741364464, "clip_ratio/high_mean": 0.004433682741364464, "clip_ratio/low_mean": 0.00507182243745774, "clip_ratio/low_min": 0.00507182243745774, "clip_ratio/region_mean": 0.009505505149718374, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 780.125, "completions/mean_terminated_length": 780.125, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "entropy": 0.05075050005689263, "epoch": 1.5280000611200024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00281888572499156, "kl": 0.02781125484034419, "learning_rate": 7.462979999338658e-06, "loss": 0.0001, "num_tokens": 6676343.0, "reward": -4.21556282043457, "reward_std": 10.447161674499512, "rewards/rollout_reward_func/mean": -4.21556282043457, "rewards/rollout_reward_func/std": 10.447161674499512, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.681640625, "sampling/sampling_logp_difference/mean": 0.3422618806362152, "step": 382, "step_time": 15.719687431002967 }, { "clip_ratio/high_max": 0.004782348376465961, "clip_ratio/high_mean": 0.004782348376465961, "clip_ratio/low_mean": 0.006059475592337549, "clip_ratio/low_min": 0.006059475592337549, "clip_ratio/region_mean": 0.01084182399790734, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 761.5625, "completions/mean_terminated_length": 761.5625, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.05046800244599581, "epoch": 1.5320000612800025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008261616341769695, "kl": 0.03195733833126724, "learning_rate": 7.462979999334829e-06, "loss": 0.0001, "num_tokens": 6701419.0, "reward": -2.801422119140625, "reward_std": 5.5699615478515625, "rewards/rollout_reward_func/mean": -2.801422119140625, "rewards/rollout_reward_func/std": 5.5699615478515625, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.5, "sampling/sampling_logp_difference/mean": 0.3560677468776703, "step": 383, "step_time": 15.685733831996913 }, { "clip_ratio/high_max": 0.006166586186736822, "clip_ratio/high_mean": 0.006166586186736822, "clip_ratio/low_mean": 0.0026877758209593594, "clip_ratio/low_min": 0.0026877758209593594, "clip_ratio/region_mean": 0.008854362065903842, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 798.0625, "completions/mean_terminated_length": 798.0625, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "entropy": 0.04778579901903868, "epoch": 1.5360000614400026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004185694735497236, "kl": 0.027696145232766867, "learning_rate": 7.4629799993309895e-06, "loss": 0.0001, "num_tokens": 6727126.0, "reward": -6.105064392089844, "reward_std": 10.051389694213867, "rewards/rollout_reward_func/mean": -6.105064392089844, "rewards/rollout_reward_func/std": 10.051390647888184, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.1875, "sampling/sampling_logp_difference/mean": 0.3436160683631897, "step": 384, "step_time": 15.849247177997313 }, { "clip_ratio/high_max": 0.00551637724856846, "clip_ratio/high_mean": 0.00551637724856846, "clip_ratio/low_mean": 0.0036177282745484263, "clip_ratio/low_min": 0.0036177282745484263, "clip_ratio/region_mean": 0.009134105523116887, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 763.9375, "completions/mean_terminated_length": 763.9375, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "entropy": 0.05134060746058822, "epoch": 1.5400000616000023e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00504625728353858, "kl": 0.026903083780780435, "learning_rate": 7.46297999932714e-06, "loss": 0.0001, "num_tokens": 6752248.0, "reward": -5.0062408447265625, "reward_std": 9.779511451721191, "rewards/rollout_reward_func/mean": -5.0062408447265625, "rewards/rollout_reward_func/std": 9.779511451721191, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.289066314697266, "sampling/sampling_logp_difference/mean": 0.3352620005607605, "step": 385, "step_time": 15.682092871997156 }, { "clip_ratio/high_max": 0.004583560221362859, "clip_ratio/high_mean": 0.004583560221362859, "clip_ratio/low_mean": 0.003783444524742663, "clip_ratio/low_min": 0.003783444524742663, "clip_ratio/region_mean": 0.008367004804313183, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 775.4375, "completions/mean_terminated_length": 775.4375, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "entropy": 0.050676772836595774, "epoch": 1.5440000617600024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.016999561339616776, "kl": 0.03015987970866263, "learning_rate": 7.462979999323278e-06, "loss": 0.0001, "num_tokens": 6777557.0, "reward": -5.111053466796875, "reward_std": 12.52268123626709, "rewards/rollout_reward_func/mean": -5.111053466796875, "rewards/rollout_reward_func/std": 12.52268123626709, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.9375, "sampling/sampling_logp_difference/mean": 0.33781757950782776, "step": 386, "step_time": 15.681333023996558 }, { "clip_ratio/high_max": 0.005138521752087399, "clip_ratio/high_mean": 0.005138521752087399, "clip_ratio/low_mean": 0.0054023946868255734, "clip_ratio/low_min": 0.0054023946868255734, "clip_ratio/region_mean": 0.010540916468016803, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 767.0, "completions/mean_terminated_length": 767.0, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "entropy": 0.05147658195346594, "epoch": 1.5480000619200025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005441848188638687, "kl": 0.02608946361579001, "learning_rate": 7.4629799993194055e-06, "loss": 0.0001, "num_tokens": 6802754.0, "reward": 1.7579165697097778, "reward_std": 12.030631065368652, "rewards/rollout_reward_func/mean": 1.7579165697097778, "rewards/rollout_reward_func/std": 12.030632972717285, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.8046875, "sampling/sampling_logp_difference/mean": 0.35169410705566406, "step": 387, "step_time": 15.536019686987856 }, { "clip_ratio/high_max": 0.004097190801985562, "clip_ratio/high_mean": 0.004097190801985562, "clip_ratio/low_mean": 0.005256902513792738, "clip_ratio/low_min": 0.005256902513792738, "clip_ratio/region_mean": 0.009354093403089792, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 747.0625, "completions/mean_terminated_length": 747.0625, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "entropy": 0.04822982335463166, "epoch": 1.5520000620800026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006872978992760181, "kl": 0.031982728745788336, "learning_rate": 7.462979999315522e-06, "loss": 0.0001, "num_tokens": 6827570.0, "reward": -1.0015848875045776, "reward_std": 6.7907938957214355, "rewards/rollout_reward_func/mean": -1.0015848875045776, "rewards/rollout_reward_func/std": 6.790794372558594, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 33.5625, "sampling/sampling_logp_difference/mean": 0.3496389389038086, "step": 388, "step_time": 15.394894755001587 }, { "clip_ratio/high_max": 0.004049097071401775, "clip_ratio/high_mean": 0.004049097071401775, "clip_ratio/low_mean": 0.005618516588583589, "clip_ratio/low_min": 0.005618516588583589, "clip_ratio/region_mean": 0.009667613601777703, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 762.125, "completions/mean_terminated_length": 762.125, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "entropy": 0.05325077008455992, "epoch": 1.5560000622400027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005581921432167292, "kl": 0.0252778516151011, "learning_rate": 7.462979999311628e-06, "loss": 0.0001, "num_tokens": 6852664.0, "reward": -2.6253557205200195, "reward_std": 4.263331413269043, "rewards/rollout_reward_func/mean": -2.6253557205200195, "rewards/rollout_reward_func/std": 4.263331413269043, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.78125, "sampling/sampling_logp_difference/mean": 0.3480810225009918, "step": 389, "step_time": 15.701161494995176 }, { "clip_ratio/high_max": 0.005086805671453476, "clip_ratio/high_mean": 0.005086805671453476, "clip_ratio/low_mean": 0.004967884859070182, "clip_ratio/low_min": 0.004967884859070182, "clip_ratio/region_mean": 0.010054690472315997, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 773.25, "completions/mean_terminated_length": 773.25, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "entropy": 0.048244206700474024, "epoch": 1.5600000624000024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.04126261547207832, "kl": 0.046710497699677944, "learning_rate": 7.462979999307722e-06, "loss": 0.0002, "num_tokens": 6877957.0, "reward": -2.055720329284668, "reward_std": 7.709848880767822, "rewards/rollout_reward_func/mean": -2.055720329284668, "rewards/rollout_reward_func/std": 7.709849834442139, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.625, "sampling/sampling_logp_difference/mean": 0.33645838499069214, "step": 390, "step_time": 15.746769744007906 }, { "clip_ratio/high_max": 0.00540886729140766, "clip_ratio/high_mean": 0.00540886729140766, "clip_ratio/low_mean": 0.003976903157308698, "clip_ratio/low_min": 0.003976903157308698, "clip_ratio/region_mean": 0.009385770419612527, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 787.875, "completions/mean_terminated_length": 787.875, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "entropy": 0.04900149768218398, "epoch": 1.5640000625600025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031901777256280184, "kl": 0.024959378642961383, "learning_rate": 7.462979999303806e-06, "loss": 0.0001, "num_tokens": 6903480.0, "reward": -2.422349214553833, "reward_std": 11.221878051757812, "rewards/rollout_reward_func/mean": -2.422349214553833, "rewards/rollout_reward_func/std": 11.221878051757812, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.421875, "sampling/sampling_logp_difference/mean": 0.33265018463134766, "step": 391, "step_time": 15.720651091003674 }, { "clip_ratio/high_max": 0.003652030194643885, "clip_ratio/high_mean": 0.003652030194643885, "clip_ratio/low_mean": 0.006348286755383015, "clip_ratio/low_min": 0.006348286755383015, "clip_ratio/region_mean": 0.010000316775403917, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 764.375, "completions/mean_terminated_length": 764.375, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "entropy": 0.05277464026585221, "epoch": 1.5680000627200026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0023786418605595827, "kl": 0.022558885859325528, "learning_rate": 7.462979999299877e-06, "loss": 0.0001, "num_tokens": 6928619.0, "reward": -1.9823788404464722, "reward_std": 6.871199131011963, "rewards/rollout_reward_func/mean": -1.9823788404464722, "rewards/rollout_reward_func/std": 6.871199607849121, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.4375, "sampling/sampling_logp_difference/mean": 0.3386159837245941, "step": 392, "step_time": 15.841974476999894 }, { "clip_ratio/high_max": 0.006684118416160345, "clip_ratio/high_mean": 0.006684118416160345, "clip_ratio/low_mean": 0.0034838106366805732, "clip_ratio/low_min": 0.0034838106366805732, "clip_ratio/region_mean": 0.010167929052840918, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 768.75, "completions/mean_terminated_length": 768.75, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "entropy": 0.047466170974075794, "epoch": 1.5720000628800026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002183799399062991, "kl": 0.02250981912948191, "learning_rate": 7.462979999295938e-06, "loss": 0.0001, "num_tokens": 6953822.0, "reward": -4.1765031814575195, "reward_std": 7.129260063171387, "rewards/rollout_reward_func/mean": -4.1765031814575195, "rewards/rollout_reward_func/std": 7.1292595863342285, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.078125, "sampling/sampling_logp_difference/mean": 0.3445900082588196, "step": 393, "step_time": 15.741972886004078 }, { "clip_ratio/high_max": 0.007154918042942882, "clip_ratio/high_mean": 0.007154918042942882, "clip_ratio/low_mean": 0.003438497195020318, "clip_ratio/low_min": 0.003438497195020318, "clip_ratio/region_mean": 0.01059341529617086, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 771.9375, "completions/mean_terminated_length": 771.9375, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 0.05156597448512912, "epoch": 1.5760000630400024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002383087994530797, "kl": 0.02186523179989308, "learning_rate": 7.462979999291989e-06, "loss": 0.0001, "num_tokens": 6979072.0, "reward": -4.550000190734863, "reward_std": 6.68671178817749, "rewards/rollout_reward_func/mean": -4.550000190734863, "rewards/rollout_reward_func/std": 6.68671178817749, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.0625, "sampling/sampling_logp_difference/mean": 0.35144752264022827, "step": 394, "step_time": 15.782682040000509 }, { "clip_ratio/high_max": 0.005622584605589509, "clip_ratio/high_mean": 0.005622584605589509, "clip_ratio/low_mean": 0.004909976705675945, "clip_ratio/low_min": 0.004909976705675945, "clip_ratio/region_mean": 0.010532561223953962, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 742.9375, "completions/mean_terminated_length": 742.9375, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "entropy": 0.05158217251300812, "epoch": 1.5800000632000025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003911233507096767, "kl": 0.027845494681969285, "learning_rate": 7.462979999288027e-06, "loss": 0.0001, "num_tokens": 7003832.0, "reward": -3.44034481048584, "reward_std": 7.979660511016846, "rewards/rollout_reward_func/mean": -3.44034481048584, "rewards/rollout_reward_func/std": 7.979660987854004, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.884769439697266, "sampling/sampling_logp_difference/mean": 0.34593793749809265, "step": 395, "step_time": 15.37073092499486 }, { "clip_ratio/high_max": 0.006145882012788206, "clip_ratio/high_mean": 0.006145882012788206, "clip_ratio/low_mean": 0.0036367044085636735, "clip_ratio/low_min": 0.0036367044085636735, "clip_ratio/region_mean": 0.00978258642135188, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 746.25, "completions/mean_terminated_length": 746.25, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "entropy": 0.04946057125926018, "epoch": 1.5840000633600025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004968792200088501, "kl": 0.02986625744961202, "learning_rate": 7.462979999284056e-06, "loss": 0.0001, "num_tokens": 7028633.0, "reward": -7.097359657287598, "reward_std": 6.753442764282227, "rewards/rollout_reward_func/mean": -7.097359657287598, "rewards/rollout_reward_func/std": 6.753442764282227, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.977630615234375, "sampling/sampling_logp_difference/mean": 0.34801939129829407, "step": 396, "step_time": 15.551142007992894 }, { "clip_ratio/high_max": 0.005246163724223152, "clip_ratio/high_mean": 0.005246163724223152, "clip_ratio/low_mean": 0.004388025583466515, "clip_ratio/low_min": 0.004388025583466515, "clip_ratio/region_mean": 0.009634189249482006, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 743.5, "completions/mean_terminated_length": 743.5, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.05208042869344354, "epoch": 1.5880000635200026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022652973420917988, "kl": 0.019157650647684932, "learning_rate": 7.4629799992800735e-06, "loss": 0.0001, "num_tokens": 7053451.0, "reward": 1.2421526908874512, "reward_std": 28.531606674194336, "rewards/rollout_reward_func/mean": 1.2421526908874512, "rewards/rollout_reward_func/std": 28.531606674194336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.06640625, "sampling/sampling_logp_difference/mean": 0.35279837250709534, "step": 397, "step_time": 15.576171302993316 }, { "clip_ratio/high_max": 0.005392818333348259, "clip_ratio/high_mean": 0.005392818333348259, "clip_ratio/low_mean": 0.00367357034701854, "clip_ratio/low_min": 0.00367357034701854, "clip_ratio/region_mean": 0.009066388593055308, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 770.625, "completions/mean_terminated_length": 770.625, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "entropy": 0.04934491543099284, "epoch": 1.5920000636800027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024368080776184797, "kl": 0.02262735366821289, "learning_rate": 7.46297999927608e-06, "loss": 0.0001, "num_tokens": 7078687.0, "reward": -2.3792221546173096, "reward_std": 5.039938449859619, "rewards/rollout_reward_func/mean": -2.3792221546173096, "rewards/rollout_reward_func/std": 5.039938449859619, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.90625, "sampling/sampling_logp_difference/mean": 0.34471070766448975, "step": 398, "step_time": 15.789743961009663 }, { "clip_ratio/high_max": 0.003521235630614683, "clip_ratio/high_mean": 0.003521235630614683, "clip_ratio/low_mean": 0.006128766428446397, "clip_ratio/low_min": 0.006128766428446397, "clip_ratio/region_mean": 0.00965000205906108, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 765.3125, "completions/mean_terminated_length": 765.3125, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "entropy": 0.050505613442510366, "epoch": 1.5960000638400025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024737846106290817, "kl": 0.021988302934914827, "learning_rate": 7.462979999272074e-06, "loss": 0.0001, "num_tokens": 7103844.0, "reward": -0.2974691390991211, "reward_std": 10.909343719482422, "rewards/rollout_reward_func/mean": -0.2974691390991211, "rewards/rollout_reward_func/std": 10.909343719482422, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.875, "sampling/sampling_logp_difference/mean": 0.3398814797401428, "step": 399, "step_time": 19.788060676008172 }, { "clip_ratio/high_max": 0.0036286261456552893, "clip_ratio/high_mean": 0.0036286261456552893, "clip_ratio/low_mean": 0.006802471558330581, "clip_ratio/low_min": 0.006802471558330581, "clip_ratio/region_mean": 0.010431097645778209, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 760.0, "completions/mean_terminated_length": 760.0, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "entropy": 0.04913257109001279, "epoch": 1.6000000640000025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00475487532094121, "kl": 0.022771736606955528, "learning_rate": 7.4629799992680575e-06, "loss": 0.0001, "num_tokens": 7128891.0, "reward": -2.5548665523529053, "reward_std": 7.432679653167725, "rewards/rollout_reward_func/mean": -2.5548665523529053, "rewards/rollout_reward_func/std": 7.432679176330566, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.375, "sampling/sampling_logp_difference/mean": 0.344637930393219, "step": 400, "step_time": 15.57796607301134 }, { "clip_ratio/high_max": 0.004215686465613544, "clip_ratio/high_mean": 0.004215686465613544, "clip_ratio/low_mean": 0.004964448016835377, "clip_ratio/low_min": 0.004964448016835377, "clip_ratio/region_mean": 0.009180134511552751, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 959.8125, "completions/mean_terminated_length": 959.8125, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "entropy": 0.046566936653107405, "epoch": 1.6040000641600026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.07958182692527771, "kl": 0.07384899980388582, "learning_rate": 7.462979999264031e-06, "loss": 0.0003, "num_tokens": 7157170.0, "reward": 3.3236284255981445, "reward_std": 9.936704635620117, "rewards/rollout_reward_func/mean": 3.3236284255981445, "rewards/rollout_reward_func/std": 9.936704635620117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.3125, "sampling/sampling_logp_difference/mean": 0.3424290120601654, "step": 401, "step_time": 18.15795979899849 }, { "clip_ratio/high_max": 0.0031806410988792777, "clip_ratio/high_mean": 0.0031806410988792777, "clip_ratio/low_mean": 0.0065914426813833416, "clip_ratio/low_min": 0.0065914426813833416, "clip_ratio/region_mean": 0.00977208383847028, "completions/clipped_ratio": 0.0, "completions/max_length": 988.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 959.5, "completions/mean_terminated_length": 959.5, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "entropy": 0.04673037910833955, "epoch": 1.6080000643200027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006991271395236254, "kl": 0.02722498879302293, "learning_rate": 7.462979999259992e-06, "loss": 0.0001, "num_tokens": 7185430.0, "reward": -0.12582403421401978, "reward_std": 6.500052452087402, "rewards/rollout_reward_func/mean": -0.12582403421401978, "rewards/rollout_reward_func/std": 6.5000529289245605, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.18764114379883, "sampling/sampling_logp_difference/mean": 0.3409353792667389, "step": 402, "step_time": 17.706873145005375 }, { "clip_ratio/high_max": 0.004423277277965099, "clip_ratio/high_mean": 0.004423277277965099, "clip_ratio/low_mean": 0.003974677558289841, "clip_ratio/low_min": 0.003974677558289841, "clip_ratio/region_mean": 0.008397954807151109, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 901.125, "completions/mean_terminated_length": 901.125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.04535967484116554, "epoch": 1.6120000644800024e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0045353504829108715, "kl": 0.02901931549422443, "learning_rate": 7.462979999255943e-06, "loss": 0.0001, "num_tokens": 7212738.0, "reward": 10.266290664672852, "reward_std": 24.348386764526367, "rewards/rollout_reward_func/mean": 10.266290664672852, "rewards/rollout_reward_func/std": 24.348386764526367, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.5, "sampling/sampling_logp_difference/mean": 0.34225383400917053, "step": 403, "step_time": 17.596100797010877 }, { "clip_ratio/high_max": 0.005410946236224845, "clip_ratio/high_mean": 0.005410946236224845, "clip_ratio/low_mean": 0.004876147722825408, "clip_ratio/low_min": 0.004876147722825408, "clip_ratio/region_mean": 0.0102870938135311, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 972.9375, "completions/mean_terminated_length": 972.9375, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "entropy": 0.04565545590594411, "epoch": 1.6160000646400025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00907870102673769, "kl": 0.03054471663199365, "learning_rate": 7.462979999251883e-06, "loss": 0.0001, "num_tokens": 7241217.0, "reward": 5.655145645141602, "reward_std": 13.975126266479492, "rewards/rollout_reward_func/mean": 5.655145645141602, "rewards/rollout_reward_func/std": 13.975127220153809, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.21886444091797, "sampling/sampling_logp_difference/mean": 0.33868473768234253, "step": 404, "step_time": 18.169078835977416 }, { "clip_ratio/high_max": 0.003548805194441229, "clip_ratio/high_mean": 0.003548805194441229, "clip_ratio/low_mean": 0.006393872608896345, "clip_ratio/low_min": 0.006393872608896345, "clip_ratio/region_mean": 0.009942677861545235, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 969.5625, "completions/mean_terminated_length": 969.5625, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "entropy": 0.04664909141138196, "epoch": 1.6200000648000026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008852638304233551, "kl": 0.032187198754400015, "learning_rate": 7.462979999247812e-06, "loss": 0.0002, "num_tokens": 7269645.0, "reward": 1.1856698989868164, "reward_std": 8.007182121276855, "rewards/rollout_reward_func/mean": 1.1856698989868164, "rewards/rollout_reward_func/std": 8.007182121276855, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.9375, "sampling/sampling_logp_difference/mean": 0.33650892972946167, "step": 405, "step_time": 17.852312136019464 }, { "clip_ratio/high_max": 0.005663186602760106, "clip_ratio/high_mean": 0.005663186602760106, "clip_ratio/low_mean": 0.005143869260791689, "clip_ratio/low_min": 0.005143869260791689, "clip_ratio/region_mean": 0.010807055863551795, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 948.1875, "completions/mean_terminated_length": 948.1875, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "entropy": 0.04909295868128538, "epoch": 1.6240000649600027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01462565641850233, "kl": 0.03155928896740079, "learning_rate": 7.462979999243729e-06, "loss": 0.0001, "num_tokens": 7297707.0, "reward": -0.44646885991096497, "reward_std": 6.805074214935303, "rewards/rollout_reward_func/mean": -0.44646885991096497, "rewards/rollout_reward_func/std": 6.805074214935303, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.109375, "sampling/sampling_logp_difference/mean": 0.35080063343048096, "step": 406, "step_time": 17.850183697999455 }, { "clip_ratio/high_max": 0.002543809765484184, "clip_ratio/high_mean": 0.002543809765484184, "clip_ratio/low_mean": 0.0070423616271000355, "clip_ratio/low_min": 0.0070423616271000355, "clip_ratio/region_mean": 0.009586171479895711, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 940.75, "completions/mean_terminated_length": 940.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.04832548089325428, "epoch": 1.6280000651200028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036807542201131582, "kl": 0.02918064850382507, "learning_rate": 7.462979999239635e-06, "loss": 0.0001, "num_tokens": 7325686.0, "reward": 3.112384796142578, "reward_std": 26.595373153686523, "rewards/rollout_reward_func/mean": 3.112384796142578, "rewards/rollout_reward_func/std": 26.595373153686523, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.59375, "sampling/sampling_logp_difference/mean": 0.33568161725997925, "step": 407, "step_time": 17.74714902300184 }, { "clip_ratio/high_max": 0.006099258112953976, "clip_ratio/high_mean": 0.006099258112953976, "clip_ratio/low_mean": 0.0025111709837801754, "clip_ratio/low_min": 0.0025111709837801754, "clip_ratio/region_mean": 0.008610429125837982, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 950.4375, "completions/mean_terminated_length": 950.4375, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "entropy": 0.048388725612312555, "epoch": 1.6320000652800025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030542216263711452, "kl": 0.02533158496953547, "learning_rate": 7.4629799992355306e-06, "loss": 0.0001, "num_tokens": 7353789.0, "reward": 1.3891171216964722, "reward_std": 11.154633522033691, "rewards/rollout_reward_func/mean": 1.3891171216964722, "rewards/rollout_reward_func/std": 11.154633522033691, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.0, "sampling/sampling_logp_difference/mean": 0.33241477608680725, "step": 408, "step_time": 17.614655429999402 }, { "clip_ratio/high_max": 0.004806660290341824, "clip_ratio/high_mean": 0.004806660290341824, "clip_ratio/low_mean": 0.005886980186915025, "clip_ratio/low_min": 0.005886980186915025, "clip_ratio/region_mean": 0.010693640448153019, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 955.75, "completions/mean_terminated_length": 955.75, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "entropy": 0.04545213235542178, "epoch": 1.6360000654400026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003298990661278367, "kl": 0.027669462142512202, "learning_rate": 7.462979999231415e-06, "loss": 0.0001, "num_tokens": 7381982.0, "reward": 4.055266380310059, "reward_std": 18.664302825927734, "rewards/rollout_reward_func/mean": 4.055266380310059, "rewards/rollout_reward_func/std": 18.664302825927734, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.609375, "sampling/sampling_logp_difference/mean": 0.3594960868358612, "step": 409, "step_time": 17.780007804998604 }, { "clip_ratio/high_max": 0.005019544216338545, "clip_ratio/high_mean": 0.005019544216338545, "clip_ratio/low_mean": 0.00489452519104816, "clip_ratio/low_min": 0.00489452519104816, "clip_ratio/region_mean": 0.009914069320075214, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 965.625, "completions/mean_terminated_length": 965.625, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "entropy": 0.04562258021906018, "epoch": 1.6400000656000027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037478068843483925, "kl": 0.0279210414737463, "learning_rate": 7.462979999227289e-06, "loss": 0.0001, "num_tokens": 7410332.0, "reward": 2.8704514503479004, "reward_std": 14.69623851776123, "rewards/rollout_reward_func/mean": 2.8704514503479004, "rewards/rollout_reward_func/std": 14.696240425109863, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.796875, "sampling/sampling_logp_difference/mean": 0.3371056318283081, "step": 410, "step_time": 17.784790331003023 }, { "clip_ratio/high_max": 0.004159574134973809, "clip_ratio/high_mean": 0.004159574134973809, "clip_ratio/low_mean": 0.004931365547236055, "clip_ratio/low_min": 0.004931365547236055, "clip_ratio/region_mean": 0.009090939653106034, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 957.125, "completions/mean_terminated_length": 957.125, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "entropy": 0.04865519609302282, "epoch": 1.6440000657600027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003963853232562542, "kl": 0.026308433152735233, "learning_rate": 7.462979999223151e-06, "loss": 0.0001, "num_tokens": 7438561.0, "reward": 9.230756759643555, "reward_std": 17.888219833374023, "rewards/rollout_reward_func/mean": 9.230756759643555, "rewards/rollout_reward_func/std": 17.888219833374023, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.75390625, "sampling/sampling_logp_difference/mean": 0.3390613794326782, "step": 411, "step_time": 17.835369220993016 }, { "clip_ratio/high_max": 0.004630759038263932, "clip_ratio/high_mean": 0.004630759038263932, "clip_ratio/low_mean": 0.005474812962347642, "clip_ratio/low_min": 0.005474812962347642, "clip_ratio/region_mean": 0.010105572000611573, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 982.5, "completions/mean_terminated_length": 982.5, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "entropy": 0.04447039868682623, "epoch": 1.6480000659200025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037777405232191086, "kl": 0.02827038848772645, "learning_rate": 7.462979999219002e-06, "loss": 0.0001, "num_tokens": 7467202.0, "reward": 2.3077664375305176, "reward_std": 20.691072463989258, "rewards/rollout_reward_func/mean": 2.3077664375305176, "rewards/rollout_reward_func/std": 20.69107437133789, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.4375, "sampling/sampling_logp_difference/mean": 0.33338552713394165, "step": 412, "step_time": 17.95695283699024 }, { "clip_ratio/high_max": 0.004992259258870035, "clip_ratio/high_mean": 0.004992259258870035, "clip_ratio/low_mean": 0.004464808938791975, "clip_ratio/low_min": 0.004464808938791975, "clip_ratio/region_mean": 0.00945706816855818, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 934.4375, "completions/mean_terminated_length": 934.4375, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "entropy": 0.04898903239518404, "epoch": 1.6520000660800026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033516010735183954, "kl": 0.023707238025963306, "learning_rate": 7.462979999214843e-06, "loss": 0.0001, "num_tokens": 7495027.0, "reward": 1.1502294540405273, "reward_std": 11.693382263183594, "rewards/rollout_reward_func/mean": 1.1502294540405273, "rewards/rollout_reward_func/std": 11.69338321685791, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.04695510864258, "sampling/sampling_logp_difference/mean": 0.3436765670776367, "step": 413, "step_time": 17.672391884996614 }, { "clip_ratio/high_max": 0.003448835137533024, "clip_ratio/high_mean": 0.003448835137533024, "clip_ratio/low_mean": 0.006062106782337651, "clip_ratio/low_min": 0.006062106782337651, "clip_ratio/region_mean": 0.009510941861663014, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 958.5625, "completions/mean_terminated_length": 958.5625, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "entropy": 0.047623359598219395, "epoch": 1.6560000662400026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003189076902344823, "kl": 0.024161669658496976, "learning_rate": 7.462979999210672e-06, "loss": 0.0001, "num_tokens": 7523251.0, "reward": 5.109198570251465, "reward_std": 14.637730598449707, "rewards/rollout_reward_func/mean": 5.109198570251465, "rewards/rollout_reward_func/std": 14.637730598449707, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.75, "sampling/sampling_logp_difference/mean": 0.32509756088256836, "step": 414, "step_time": 18.063460611003393 }, { "clip_ratio/high_max": 0.0028787296905647963, "clip_ratio/high_mean": 0.0028787296905647963, "clip_ratio/low_mean": 0.006989867310039699, "clip_ratio/low_min": 0.006989867310039699, "clip_ratio/region_mean": 0.009868596913293004, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 910.25, "completions/mean_terminated_length": 910.25, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.04713264433667064, "epoch": 1.6600000664000027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.03405157849192619, "kl": 0.03758587595075369, "learning_rate": 7.462979999206491e-06, "loss": 0.0002, "num_tokens": 7550710.0, "reward": 5.928678035736084, "reward_std": 25.502016067504883, "rewards/rollout_reward_func/mean": 5.928678035736084, "rewards/rollout_reward_func/std": 25.502017974853516, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.2578125, "sampling/sampling_logp_difference/mean": 0.3427675664424896, "step": 415, "step_time": 17.51557031300763 }, { "clip_ratio/high_max": 0.0048779040516819805, "clip_ratio/high_mean": 0.0048779040516819805, "clip_ratio/low_mean": 0.00491243033320643, "clip_ratio/low_min": 0.00491243033320643, "clip_ratio/region_mean": 0.00979033432668075, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 958.125, "completions/mean_terminated_length": 958.125, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "entropy": 0.04586570104584098, "epoch": 1.6640000665600028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002668438944965601, "kl": 0.021874118130654097, "learning_rate": 7.462979999202296e-06, "loss": 0.0001, "num_tokens": 7578948.0, "reward": 4.693153381347656, "reward_std": 12.751203536987305, "rewards/rollout_reward_func/mean": 4.693153381347656, "rewards/rollout_reward_func/std": 12.751202583312988, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.765625, "sampling/sampling_logp_difference/mean": 0.33364513516426086, "step": 416, "step_time": 17.83519918200909 }, { "clip_ratio/high_max": 0.005484365945449099, "clip_ratio/high_mean": 0.005484365945449099, "clip_ratio/low_mean": 0.0038328175141941756, "clip_ratio/low_min": 0.0038328175141941756, "clip_ratio/region_mean": 0.009317183517850935, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 954.0625, "completions/mean_terminated_length": 954.0625, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "entropy": 0.04592451313510537, "epoch": 1.6680000667200026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002373620169237256, "kl": 0.02273173606954515, "learning_rate": 7.462979999198093e-06, "loss": 0.0001, "num_tokens": 7607112.0, "reward": 0.07785463333129883, "reward_std": 8.733792304992676, "rewards/rollout_reward_func/mean": 0.07785463333129883, "rewards/rollout_reward_func/std": 8.73379135131836, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.0, "sampling/sampling_logp_difference/mean": 0.34270694851875305, "step": 417, "step_time": 17.815113208016555 }, { "clip_ratio/high_max": 0.006260539637878537, "clip_ratio/high_mean": 0.006260539637878537, "clip_ratio/low_mean": 0.004393030452774838, "clip_ratio/low_min": 0.004393030452774838, "clip_ratio/region_mean": 0.010653570061549544, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 955.625, "completions/mean_terminated_length": 955.625, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "entropy": 0.04391592741012573, "epoch": 1.6720000668800026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024793429765850306, "kl": 0.021179570234380662, "learning_rate": 7.4629799991938786e-06, "loss": 0.0001, "num_tokens": 7635294.0, "reward": -2.308037519454956, "reward_std": 7.0721282958984375, "rewards/rollout_reward_func/mean": -2.308037519454956, "rewards/rollout_reward_func/std": 7.072127819061279, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.8203125, "sampling/sampling_logp_difference/mean": 0.339396208524704, "step": 418, "step_time": 17.855951685000036 }, { "clip_ratio/high_max": 0.0034460721362847835, "clip_ratio/high_mean": 0.0034460721362847835, "clip_ratio/low_mean": 0.007006821862887591, "clip_ratio/low_min": 0.007006821862887591, "clip_ratio/region_mean": 0.010452894028276205, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 938.5, "completions/mean_terminated_length": 938.5, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "entropy": 0.04522094363346696, "epoch": 1.6760000670400027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004615001380443573, "kl": 0.025567762553691864, "learning_rate": 7.462979999189653e-06, "loss": 0.0001, "num_tokens": 7663190.0, "reward": 3.681119441986084, "reward_std": 10.175195693969727, "rewards/rollout_reward_func/mean": 3.681119441986084, "rewards/rollout_reward_func/std": 10.175195693969727, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.9375, "sampling/sampling_logp_difference/mean": 0.353962779045105, "step": 419, "step_time": 18.147358648988302 }, { "clip_ratio/high_max": 0.006565779272932559, "clip_ratio/high_mean": 0.006565779272932559, "clip_ratio/low_mean": 0.00446302501950413, "clip_ratio/low_min": 0.00446302501950413, "clip_ratio/region_mean": 0.01102880429243669, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 945.375, "completions/mean_terminated_length": 945.375, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "entropy": 0.044680547434836626, "epoch": 1.6800000672000028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01473651360720396, "kl": 0.03305017785169184, "learning_rate": 7.462979999185414e-06, "loss": 0.0002, "num_tokens": 7691190.0, "reward": 2.9798994064331055, "reward_std": 13.001416206359863, "rewards/rollout_reward_func/mean": 2.9798994064331055, "rewards/rollout_reward_func/std": 13.001416206359863, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.3125, "sampling/sampling_logp_difference/mean": 0.3470442295074463, "step": 420, "step_time": 18.12841343400214 }, { "clip_ratio/high_max": 0.005341217969544232, "clip_ratio/high_mean": 0.005341217969544232, "clip_ratio/low_mean": 0.00598863500636071, "clip_ratio/low_min": 0.00598863500636071, "clip_ratio/region_mean": 0.011329853092320263, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 944.4375, "completions/mean_terminated_length": 944.4375, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "entropy": 0.045396854635328054, "epoch": 1.6840000673600025e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004685628227889538, "kl": 0.02789842220954597, "learning_rate": 7.462979999181166e-06, "loss": 0.0001, "num_tokens": 7719204.0, "reward": -0.2885214388370514, "reward_std": 9.539775848388672, "rewards/rollout_reward_func/mean": -0.2885214388370514, "rewards/rollout_reward_func/std": 9.539775848388672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.688873291015625, "sampling/sampling_logp_difference/mean": 0.3592691421508789, "step": 421, "step_time": 18.216612892007106 }, { "clip_ratio/high_max": 0.004486447665840387, "clip_ratio/high_mean": 0.004486447665840387, "clip_ratio/low_mean": 0.005205746623687446, "clip_ratio/low_min": 0.005205746623687446, "clip_ratio/region_mean": 0.009692194347735494, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 978.25, "completions/mean_terminated_length": 978.25, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "entropy": 0.045246563851833344, "epoch": 1.6880000675200026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010281442664563656, "kl": 0.02592077129520476, "learning_rate": 7.462979999176907e-06, "loss": 0.0001, "num_tokens": 7747794.0, "reward": 1.7286442518234253, "reward_std": 13.336620330810547, "rewards/rollout_reward_func/mean": 1.7286442518234253, "rewards/rollout_reward_func/std": 13.336620330810547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.5, "sampling/sampling_logp_difference/mean": 0.32948827743530273, "step": 422, "step_time": 18.293466226998135 }, { "clip_ratio/high_max": 0.005900482123252004, "clip_ratio/high_mean": 0.005900482123252004, "clip_ratio/low_mean": 0.003909821418346837, "clip_ratio/low_min": 0.003909821418346837, "clip_ratio/region_mean": 0.009810303570702672, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 974.0625, "completions/mean_terminated_length": 974.0625, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "entropy": 0.045859151519834995, "epoch": 1.6920000676800027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011273573152720928, "kl": 0.026299505028873682, "learning_rate": 7.4629799991726375e-06, "loss": 0.0001, "num_tokens": 7776313.0, "reward": 4.244367599487305, "reward_std": 14.25993537902832, "rewards/rollout_reward_func/mean": 4.244367599487305, "rewards/rollout_reward_func/std": 14.259936332702637, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.50004959106445, "sampling/sampling_logp_difference/mean": 0.35095030069351196, "step": 423, "step_time": 17.82013041499158 }, { "clip_ratio/high_max": 0.004268012475222349, "clip_ratio/high_mean": 0.004268012475222349, "clip_ratio/low_mean": 0.005264263862045482, "clip_ratio/low_min": 0.005264263862045482, "clip_ratio/region_mean": 0.009532276482786983, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 961.75, "completions/mean_terminated_length": 961.75, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "entropy": 0.04504662612453103, "epoch": 1.6960000678400028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028099112678319216, "kl": 0.022825870662927628, "learning_rate": 7.462979999168357e-06, "loss": 0.0001, "num_tokens": 7804600.0, "reward": -1.155574083328247, "reward_std": 7.463468074798584, "rewards/rollout_reward_func/mean": -1.155574083328247, "rewards/rollout_reward_func/std": 7.463468074798584, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.53125, "sampling/sampling_logp_difference/mean": 0.3407798409461975, "step": 424, "step_time": 17.965580528994906 }, { "clip_ratio/high_max": 0.0035412071156315506, "clip_ratio/high_mean": 0.0035412071156315506, "clip_ratio/low_mean": 0.006408757559256628, "clip_ratio/low_min": 0.006408757559256628, "clip_ratio/region_mean": 0.00994996470399201, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 980.1875, "completions/mean_terminated_length": 980.1875, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "entropy": 0.0426293583586812, "epoch": 1.700000068000003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005956360138952732, "kl": 0.018043380579911172, "learning_rate": 7.462979999164064e-06, "loss": 0.0001, "num_tokens": 7833199.0, "reward": -2.0612030029296875, "reward_std": 10.196694374084473, "rewards/rollout_reward_func/mean": -2.0612030029296875, "rewards/rollout_reward_func/std": 10.196694374084473, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.21875, "sampling/sampling_logp_difference/mean": 0.33736100792884827, "step": 425, "step_time": 18.22011047801061 }, { "clip_ratio/high_max": 0.004051145340781659, "clip_ratio/high_mean": 0.004051145340781659, "clip_ratio/low_mean": 0.005330277708708309, "clip_ratio/low_min": 0.005330277708708309, "clip_ratio/region_mean": 0.009381423122249544, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 982.6875, "completions/mean_terminated_length": 982.6875, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "entropy": 0.04569529974833131, "epoch": 1.7040000681600026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002259480068460107, "kl": 0.019726274185813963, "learning_rate": 7.46297999915976e-06, "loss": 0.0001, "num_tokens": 7861854.0, "reward": 5.026876449584961, "reward_std": 14.149430274963379, "rewards/rollout_reward_func/mean": 5.026876449584961, "rewards/rollout_reward_func/std": 14.149430274963379, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.25, "sampling/sampling_logp_difference/mean": 0.33225345611572266, "step": 426, "step_time": 18.4024765279828 }, { "clip_ratio/high_max": 0.003870715299854055, "clip_ratio/high_mean": 0.003870715299854055, "clip_ratio/low_mean": 0.005166841670870781, "clip_ratio/low_min": 0.005166841670870781, "clip_ratio/region_mean": 0.009037556766998023, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 967.875, "completions/mean_terminated_length": 967.875, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "entropy": 0.048069811426103115, "epoch": 1.7080000683200027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026075120549649, "kl": 0.02178021497093141, "learning_rate": 7.462979999155445e-06, "loss": 0.0001, "num_tokens": 7890248.0, "reward": 0.7446079254150391, "reward_std": 5.762397289276123, "rewards/rollout_reward_func/mean": 0.7446079254150391, "rewards/rollout_reward_func/std": 5.762397289276123, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.36328125, "sampling/sampling_logp_difference/mean": 0.3357990086078644, "step": 427, "step_time": 18.233247552998364 }, { "clip_ratio/high_max": 0.0034744209769996814, "clip_ratio/high_mean": 0.0034744209769996814, "clip_ratio/low_mean": 0.006763169600162655, "clip_ratio/low_min": 0.006763169600162655, "clip_ratio/region_mean": 0.010237590526230633, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 874.0, "completions/mean_terminated_length": 874.0, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.04647931735962629, "epoch": 1.7120000684800028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00526908366009593, "kl": 0.020641003153286874, "learning_rate": 7.46297999915112e-06, "loss": 0.0001, "num_tokens": 7917128.0, "reward": 14.210506439208984, "reward_std": 30.330928802490234, "rewards/rollout_reward_func/mean": 14.210506439208984, "rewards/rollout_reward_func/std": 30.330930709838867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.375003814697266, "sampling/sampling_logp_difference/mean": 0.3599327504634857, "step": 428, "step_time": 17.489390133989218 }, { "clip_ratio/high_max": 0.0027913515805266798, "clip_ratio/high_mean": 0.0027913515805266798, "clip_ratio/low_mean": 0.008293878519907594, "clip_ratio/low_min": 0.008293878519907594, "clip_ratio/region_mean": 0.011085230158641934, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 921.625, "completions/mean_terminated_length": 921.625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.04713734891265631, "epoch": 1.716000068640003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004416835028678179, "kl": 0.024491722462698817, "learning_rate": 7.4629799991467835e-06, "loss": 0.0001, "num_tokens": 7944786.0, "reward": 10.619565963745117, "reward_std": 27.00373077392578, "rewards/rollout_reward_func/mean": 10.619565963745117, "rewards/rollout_reward_func/std": 27.003732681274414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.906253814697266, "sampling/sampling_logp_difference/mean": 0.3560481667518616, "step": 429, "step_time": 18.072951626010763 }, { "clip_ratio/high_max": 0.005467381270136684, "clip_ratio/high_mean": 0.005467381270136684, "clip_ratio/low_mean": 0.004367929475847632, "clip_ratio/low_min": 0.004367929475847632, "clip_ratio/region_mean": 0.009835310745984316, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 986.875, "completions/mean_terminated_length": 986.875, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "entropy": 0.04390808194875717, "epoch": 1.7200000688000026e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.014946931973099709, "kl": 0.02574493200518191, "learning_rate": 7.4629799991424366e-06, "loss": 0.0001, "num_tokens": 7973523.0, "reward": 1.5927479267120361, "reward_std": 9.88015365600586, "rewards/rollout_reward_func/mean": 1.5927479267120361, "rewards/rollout_reward_func/std": 9.880154609680176, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.76563262939453, "sampling/sampling_logp_difference/mean": 0.3467543423175812, "step": 430, "step_time": 18.189531807001913 }, { "clip_ratio/high_max": 0.005881503922864795, "clip_ratio/high_mean": 0.005881503922864795, "clip_ratio/low_mean": 0.0053701792494393885, "clip_ratio/low_min": 0.0053701792494393885, "clip_ratio/region_mean": 0.011251683172304183, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 947.6875, "completions/mean_terminated_length": 947.6875, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "entropy": 0.04740228969603777, "epoch": 1.7240000689600027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006270736455917358, "kl": 0.025689115864224732, "learning_rate": 7.462979999138078e-06, "loss": 0.0001, "num_tokens": 8001570.0, "reward": 3.8503799438476562, "reward_std": 16.46632957458496, "rewards/rollout_reward_func/mean": 3.8503799438476562, "rewards/rollout_reward_func/std": 16.46632957458496, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 34.375003814697266, "sampling/sampling_logp_difference/mean": 0.35021644830703735, "step": 431, "step_time": 17.902839354996104 }, { "clip_ratio/high_max": 0.004858252301346511, "clip_ratio/high_mean": 0.004858252301346511, "clip_ratio/low_mean": 0.005370564525946975, "clip_ratio/low_min": 0.005370564525946975, "clip_ratio/region_mean": 0.010228816943708807, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 936.6875, "completions/mean_terminated_length": 936.6875, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "entropy": 0.04582975339144468, "epoch": 1.7280000691200027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02042463794350624, "kl": 0.025507798418402672, "learning_rate": 7.462979999133708e-06, "loss": 0.0001, "num_tokens": 8029430.0, "reward": 1.616222620010376, "reward_std": 6.943825721740723, "rewards/rollout_reward_func/mean": 1.616222620010376, "rewards/rollout_reward_func/std": 6.943825721740723, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.1875, "sampling/sampling_logp_difference/mean": 0.3620619475841522, "step": 432, "step_time": 17.82234053198772 }, { "clip_ratio/high_max": 0.004914301709504798, "clip_ratio/high_mean": 0.004914301709504798, "clip_ratio/low_mean": 0.005054319713963196, "clip_ratio/low_min": 0.005054319713963196, "clip_ratio/region_mean": 0.009968621365260333, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 959.4375, "completions/mean_terminated_length": 959.4375, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "entropy": 0.045173972845077515, "epoch": 1.7320000692800028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029681087471544743, "kl": 0.0194023794028908, "learning_rate": 7.462979999129327e-06, "loss": 0.0001, "num_tokens": 8057680.0, "reward": -0.32063913345336914, "reward_std": 8.867982864379883, "rewards/rollout_reward_func/mean": -0.32063913345336914, "rewards/rollout_reward_func/std": 8.867982864379883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.65625, "sampling/sampling_logp_difference/mean": 0.3441246747970581, "step": 433, "step_time": 17.753443853005592 }, { "clip_ratio/high_max": 0.005517622455954552, "clip_ratio/high_mean": 0.005517622455954552, "clip_ratio/low_mean": 0.006021311040967703, "clip_ratio/low_min": 0.006021311040967703, "clip_ratio/region_mean": 0.011538933380506933, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 960.875, "completions/mean_terminated_length": 960.875, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "entropy": 0.0450088232755661, "epoch": 1.736000069440003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010772068984806538, "kl": 0.026692708721384406, "learning_rate": 7.462979999124936e-06, "loss": 0.0001, "num_tokens": 8085952.0, "reward": 2.8475255966186523, "reward_std": 14.243352890014648, "rewards/rollout_reward_func/mean": 2.8475255966186523, "rewards/rollout_reward_func/std": 14.243353843688965, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.3125, "sampling/sampling_logp_difference/mean": 0.3530057370662689, "step": 434, "step_time": 18.112410286994418 }, { "clip_ratio/high_max": 0.004857783671468496, "clip_ratio/high_mean": 0.004857783671468496, "clip_ratio/low_mean": 0.005143365997355431, "clip_ratio/low_min": 0.005143365997355431, "clip_ratio/region_mean": 0.010001149727031589, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 968.125, "completions/mean_terminated_length": 968.125, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "entropy": 0.04550622357055545, "epoch": 1.7400000696000027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026552865747362375, "kl": 0.019769119564443827, "learning_rate": 7.462979999120533e-06, "loss": 0.0001, "num_tokens": 8114348.0, "reward": -0.5327123403549194, "reward_std": 9.76424789428711, "rewards/rollout_reward_func/mean": -0.5327123403549194, "rewards/rollout_reward_func/std": 9.76424789428711, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.73438262939453, "sampling/sampling_logp_difference/mean": 0.3540739119052887, "step": 435, "step_time": 17.834762133003096 }, { "clip_ratio/high_max": 0.004894531681202352, "clip_ratio/high_mean": 0.004894531681202352, "clip_ratio/low_mean": 0.0065053598082158715, "clip_ratio/low_min": 0.0065053598082158715, "clip_ratio/region_mean": 0.011399891518522054, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 921.75, "completions/mean_terminated_length": 921.75, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "entropy": 0.04686395404860377, "epoch": 1.7440000697600027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0046363635919988155, "kl": 0.02198812598362565, "learning_rate": 7.462979999116118e-06, "loss": 0.0001, "num_tokens": 8141946.0, "reward": 6.19476842880249, "reward_std": 13.337325096130371, "rewards/rollout_reward_func/mean": 6.19476842880249, "rewards/rollout_reward_func/std": 13.337324142456055, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.875, "sampling/sampling_logp_difference/mean": 0.3546735942363739, "step": 436, "step_time": 17.588084583992895 }, { "clip_ratio/high_max": 0.003995571285486221, "clip_ratio/high_mean": 0.003995571285486221, "clip_ratio/low_mean": 0.005582322686677799, "clip_ratio/low_min": 0.005582322686677799, "clip_ratio/region_mean": 0.009577894117683172, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 966.9375, "completions/mean_terminated_length": 966.9375, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "entropy": 0.0454350421205163, "epoch": 1.7480000699200028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031619358342140913, "kl": 0.020604847697541118, "learning_rate": 7.4629799991116935e-06, "loss": 0.0001, "num_tokens": 8170316.0, "reward": -6.564638137817383, "reward_std": 12.48405933380127, "rewards/rollout_reward_func/mean": -6.564638137817383, "rewards/rollout_reward_func/std": 12.484060287475586, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.375, "sampling/sampling_logp_difference/mean": 0.3577566146850586, "step": 437, "step_time": 18.049320338999678 }, { "clip_ratio/high_max": 0.00445223503629677, "clip_ratio/high_mean": 0.00445223503629677, "clip_ratio/low_mean": 0.004889167568762787, "clip_ratio/low_min": 0.004889167568762787, "clip_ratio/region_mean": 0.009341402561403811, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 930.75, "completions/mean_terminated_length": 930.75, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "entropy": 0.044981985818594694, "epoch": 1.752000070080003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027449363842606544, "kl": 0.01970094896387309, "learning_rate": 7.462979999107258e-06, "loss": 0.0001, "num_tokens": 8198073.0, "reward": 1.2148313522338867, "reward_std": 7.559000492095947, "rewards/rollout_reward_func/mean": 1.2148313522338867, "rewards/rollout_reward_func/std": 7.5590009689331055, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.5, "sampling/sampling_logp_difference/mean": 0.36245250701904297, "step": 438, "step_time": 17.5158103710055 }, { "clip_ratio/high_max": 0.005427411757409573, "clip_ratio/high_mean": 0.005427411757409573, "clip_ratio/low_mean": 0.0048037905362434685, "clip_ratio/low_min": 0.0048037905362434685, "clip_ratio/region_mean": 0.01023120217723772, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 984.25, "completions/mean_terminated_length": 984.25, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "entropy": 0.045437251683324575, "epoch": 1.756000070240003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004166632890701294, "kl": 0.022363438620232046, "learning_rate": 7.46297999910281e-06, "loss": 0.0001, "num_tokens": 8226750.0, "reward": 1.1350761651992798, "reward_std": 17.97592544555664, "rewards/rollout_reward_func/mean": 1.1350761651992798, "rewards/rollout_reward_func/std": 17.97592544555664, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.125030517578125, "sampling/sampling_logp_difference/mean": 0.3370591104030609, "step": 439, "step_time": 18.20101639199129 }, { "clip_ratio/high_max": 0.0043774434889201075, "clip_ratio/high_mean": 0.0043774434889201075, "clip_ratio/low_mean": 0.004238664638251066, "clip_ratio/low_min": 0.004238664638251066, "clip_ratio/region_mean": 0.008616108156275004, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 972.1875, "completions/mean_terminated_length": 972.1875, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "entropy": 0.04338043974712491, "epoch": 1.7600000704000027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0020374993328005075, "kl": 0.016497994191013277, "learning_rate": 7.462979999098352e-06, "loss": 0.0001, "num_tokens": 8255221.0, "reward": -0.1508834958076477, "reward_std": 8.643309593200684, "rewards/rollout_reward_func/mean": -0.1508834958076477, "rewards/rollout_reward_func/std": 8.643310546875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.25, "sampling/sampling_logp_difference/mean": 0.34613069891929626, "step": 440, "step_time": 17.8439600930069 }, { "clip_ratio/high_max": 0.0036915026721544564, "clip_ratio/high_mean": 0.0036915026721544564, "clip_ratio/low_mean": 0.005836785421706736, "clip_ratio/low_min": 0.005836785421706736, "clip_ratio/region_mean": 0.009528288093861192, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 978.3125, "completions/mean_terminated_length": 978.3125, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "entropy": 0.04377039801329374, "epoch": 1.7640000705600028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005848707631230354, "kl": 0.024731424055062234, "learning_rate": 7.462979999093883e-06, "loss": 0.0001, "num_tokens": 8283798.0, "reward": 9.718060493469238, "reward_std": 16.132905960083008, "rewards/rollout_reward_func/mean": 9.718060493469238, "rewards/rollout_reward_func/std": 16.132905960083008, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.875, "sampling/sampling_logp_difference/mean": 0.3526565730571747, "step": 441, "step_time": 18.08864014899882 }, { "clip_ratio/high_max": 0.006940801162272692, "clip_ratio/high_mean": 0.006940801162272692, "clip_ratio/low_mean": 0.00269160361494869, "clip_ratio/low_min": 0.00269160361494869, "clip_ratio/region_mean": 0.009632404893636703, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 973.5, "completions/mean_terminated_length": 973.5, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "entropy": 0.04324194975197315, "epoch": 1.768000070720003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0077522448264062405, "kl": 0.02217978541739285, "learning_rate": 7.462979999089403e-06, "loss": 0.0001, "num_tokens": 8312291.0, "reward": 4.167507648468018, "reward_std": 10.039299011230469, "rewards/rollout_reward_func/mean": 4.167507648468018, "rewards/rollout_reward_func/std": 10.039299964904785, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.59375, "sampling/sampling_logp_difference/mean": 0.34817931056022644, "step": 442, "step_time": 18.13564182700793 }, { "clip_ratio/high_max": 0.00475018925499171, "clip_ratio/high_mean": 0.00475018925499171, "clip_ratio/low_mean": 0.005983482726151124, "clip_ratio/low_min": 0.005983482726151124, "clip_ratio/region_mean": 0.010733671952039003, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 954.125, "completions/mean_terminated_length": 954.125, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "entropy": 0.04629321862012148, "epoch": 1.772000070880003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031103864312171936, "kl": 0.018873468041419983, "learning_rate": 7.462979999084912e-06, "loss": 0.0001, "num_tokens": 8340461.0, "reward": 0.04980626702308655, "reward_std": 8.0564546585083, "rewards/rollout_reward_func/mean": 0.04980626702308655, "rewards/rollout_reward_func/std": 8.0564546585083, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.125, "sampling/sampling_logp_difference/mean": 0.35109758377075195, "step": 443, "step_time": 17.79104778799956 }, { "clip_ratio/high_max": 0.0049432513187639415, "clip_ratio/high_mean": 0.0049432513187639415, "clip_ratio/low_mean": 0.005112170736538246, "clip_ratio/low_min": 0.005112170736538246, "clip_ratio/region_mean": 0.010055421851575375, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 963.875, "completions/mean_terminated_length": 963.875, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "entropy": 0.044987133238464594, "epoch": 1.7760000710400027e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.018807362765073776, "kl": 0.02459625550545752, "learning_rate": 7.462979999080409e-06, "loss": 0.0001, "num_tokens": 8368799.0, "reward": 4.270215034484863, "reward_std": 12.30504322052002, "rewards/rollout_reward_func/mean": 4.270215034484863, "rewards/rollout_reward_func/std": 12.305044174194336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.625, "sampling/sampling_logp_difference/mean": 0.3516300320625305, "step": 444, "step_time": 18.256967170003918 }, { "clip_ratio/high_max": 0.0034467510995455086, "clip_ratio/high_mean": 0.0034467510995455086, "clip_ratio/low_mean": 0.007267554407007992, "clip_ratio/low_min": 0.007267554407007992, "clip_ratio/region_mean": 0.010714305564761162, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 923.6875, "completions/mean_terminated_length": 923.6875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.04400517791509628, "epoch": 1.7800000712000028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004891954828053713, "kl": 0.023247156874276698, "learning_rate": 7.462979999075897e-06, "loss": 0.0001, "num_tokens": 8396472.0, "reward": 9.352254867553711, "reward_std": 25.304744720458984, "rewards/rollout_reward_func/mean": 9.352254867553711, "rewards/rollout_reward_func/std": 25.304744720458984, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.062503814697266, "sampling/sampling_logp_difference/mean": 0.3603116571903229, "step": 445, "step_time": 17.74707283700991 }, { "clip_ratio/high_max": 0.006442215555580333, "clip_ratio/high_mean": 0.006442215555580333, "clip_ratio/low_mean": 0.0034566302492748946, "clip_ratio/low_min": 0.0034566302492748946, "clip_ratio/region_mean": 0.009898845863062888, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 978.125, "completions/mean_terminated_length": 978.125, "completions/min_length": 952.0, "completions/min_terminated_length": 952.0, "entropy": 0.04768166830763221, "epoch": 1.784000071360003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035273379180580378, "kl": 0.018486115150153637, "learning_rate": 7.462979999071372e-06, "loss": 0.0001, "num_tokens": 8425057.0, "reward": -3.2364614009857178, "reward_std": 15.547107696533203, "rewards/rollout_reward_func/mean": -3.2364614009857178, "rewards/rollout_reward_func/std": 15.547107696533203, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.1875, "sampling/sampling_logp_difference/mean": 0.33240485191345215, "step": 446, "step_time": 18.257891935012594 }, { "clip_ratio/high_max": 0.004976120835635811, "clip_ratio/high_mean": 0.004976120835635811, "clip_ratio/low_mean": 0.005575424118433148, "clip_ratio/low_min": 0.005575424118433148, "clip_ratio/region_mean": 0.010551544837653637, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 954.875, "completions/mean_terminated_length": 954.875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "entropy": 0.04443748760968447, "epoch": 1.788000071520003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004727366380393505, "kl": 0.026774691767059267, "learning_rate": 7.462979999066836e-06, "loss": 0.0001, "num_tokens": 8453223.0, "reward": 1.9211621284484863, "reward_std": 9.705361366271973, "rewards/rollout_reward_func/mean": 1.9211621284484863, "rewards/rollout_reward_func/std": 9.705361366271973, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.3125, "sampling/sampling_logp_difference/mean": 0.3584659993648529, "step": 447, "step_time": 17.950098186993273 }, { "clip_ratio/high_max": 0.004623494052793831, "clip_ratio/high_mean": 0.004623494052793831, "clip_ratio/low_mean": 0.004722042329376563, "clip_ratio/low_min": 0.004722042329376563, "clip_ratio/region_mean": 0.009345536294858903, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 958.875, "completions/mean_terminated_length": 958.875, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "entropy": 0.04681519977748394, "epoch": 1.792000071680003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004780458752065897, "kl": 0.023274607490748167, "learning_rate": 7.46297999906229e-06, "loss": 0.0001, "num_tokens": 8481452.0, "reward": 0.7647130489349365, "reward_std": 12.174566268920898, "rewards/rollout_reward_func/mean": 0.7647130489349365, "rewards/rollout_reward_func/std": 12.174567222595215, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 37.062503814697266, "sampling/sampling_logp_difference/mean": 0.35861143469810486, "step": 448, "step_time": 18.16590464799083 }, { "clip_ratio/high_max": 0.0050574447377584875, "clip_ratio/high_mean": 0.0050574447377584875, "clip_ratio/low_mean": 0.004977207514457405, "clip_ratio/low_min": 0.004977207514457405, "clip_ratio/region_mean": 0.010034652252215892, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 961.8125, "completions/mean_terminated_length": 961.8125, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "entropy": 0.04569733841344714, "epoch": 1.7960000718400028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003341987729072571, "kl": 0.02244951855391264, "learning_rate": 7.462979999057733e-06, "loss": 0.0001, "num_tokens": 8509760.0, "reward": 3.6284170150756836, "reward_std": 11.343659400939941, "rewards/rollout_reward_func/mean": 3.6284170150756836, "rewards/rollout_reward_func/std": 11.343660354614258, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.03125, "sampling/sampling_logp_difference/mean": 0.35450297594070435, "step": 449, "step_time": 17.96738990299491 }, { "clip_ratio/high_max": 0.0036100136931054294, "clip_ratio/high_mean": 0.0036100136931054294, "clip_ratio/low_mean": 0.0056954567262437195, "clip_ratio/low_min": 0.0056954567262437195, "clip_ratio/region_mean": 0.00930547044845298, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 952.5, "completions/mean_terminated_length": 952.5, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "entropy": 0.04558962257578969, "epoch": 1.800000072000003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028156291227787733, "kl": 0.020100743742659688, "learning_rate": 7.462979999053164e-06, "loss": 0.0001, "num_tokens": 8537894.0, "reward": 0.4982149600982666, "reward_std": 11.322041511535645, "rewards/rollout_reward_func/mean": 0.4982149600982666, "rewards/rollout_reward_func/std": 11.322041511535645, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.03135681152344, "sampling/sampling_logp_difference/mean": 0.3474365174770355, "step": 450, "step_time": 17.737529727004585 }, { "clip_ratio/high_max": 0.005176999082323164, "clip_ratio/high_mean": 0.005176999082323164, "clip_ratio/low_mean": 0.005761145323049277, "clip_ratio/low_min": 0.005761145323049277, "clip_ratio/region_mean": 0.01093814440537244, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 988.875, "completions/mean_terminated_length": 988.875, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "entropy": 0.043583931401371956, "epoch": 1.804000072160003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0051505910232663155, "kl": 0.022753614000976086, "learning_rate": 7.4629799990485845e-06, "loss": 0.0001, "num_tokens": 8566669.0, "reward": 1.9072232246398926, "reward_std": 10.104990005493164, "rewards/rollout_reward_func/mean": 1.9072232246398926, "rewards/rollout_reward_func/std": 10.104990005493164, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.85157012939453, "sampling/sampling_logp_difference/mean": 0.3319372534751892, "step": 451, "step_time": 18.1486412919985 }, { "clip_ratio/high_max": 0.0051239728345535696, "clip_ratio/high_mean": 0.0051239728345535696, "clip_ratio/low_mean": 0.004945238382788375, "clip_ratio/low_min": 0.004945238382788375, "clip_ratio/region_mean": 0.010069211246445775, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 975.3125, "completions/mean_terminated_length": 975.3125, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "entropy": 0.04436769289895892, "epoch": 1.808000072320003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.050481121987104416, "kl": 0.06132241268642247, "learning_rate": 7.4629799990439935e-06, "loss": 0.0003, "num_tokens": 8595203.0, "reward": 4.193799018859863, "reward_std": 9.804801940917969, "rewards/rollout_reward_func/mean": 4.193799018859863, "rewards/rollout_reward_func/std": 9.804802894592285, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.796875, "sampling/sampling_logp_difference/mean": 0.34578126668930054, "step": 452, "step_time": 18.033287162987108 }, { "clip_ratio/high_max": 0.004160901764407754, "clip_ratio/high_mean": 0.004160901764407754, "clip_ratio/low_mean": 0.005775547935627401, "clip_ratio/low_min": 0.005775547935627401, "clip_ratio/region_mean": 0.009936449758242816, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 921.8125, "completions/mean_terminated_length": 921.8125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.04633008781820536, "epoch": 1.8120000724800028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024300364311784506, "kl": 0.019632055249530822, "learning_rate": 7.4629799990393925e-06, "loss": 0.0001, "num_tokens": 8622866.0, "reward": 9.712772369384766, "reward_std": 21.18081283569336, "rewards/rollout_reward_func/mean": 9.712772369384766, "rewards/rollout_reward_func/std": 21.180810928344727, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.31352615356445, "sampling/sampling_logp_difference/mean": 0.3390547037124634, "step": 453, "step_time": 17.67909964201681 }, { "clip_ratio/high_max": 0.003544425097061321, "clip_ratio/high_mean": 0.003544425097061321, "clip_ratio/low_mean": 0.005913334141951054, "clip_ratio/low_min": 0.005913334141951054, "clip_ratio/region_mean": 0.009457759093493223, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 938.5, "completions/mean_terminated_length": 938.5, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "entropy": 0.04777897894382477, "epoch": 1.816000072640003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00769722880795598, "kl": 0.026782545493915677, "learning_rate": 7.4629799990347795e-06, "loss": 0.0001, "num_tokens": 8650752.0, "reward": 1.0050911903381348, "reward_std": 5.45257568359375, "rewards/rollout_reward_func/mean": 1.0050911903381348, "rewards/rollout_reward_func/std": 5.452576160430908, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.171878814697266, "sampling/sampling_logp_difference/mean": 0.3473791778087616, "step": 454, "step_time": 17.713879042988992 }, { "clip_ratio/high_max": 0.003857038274873048, "clip_ratio/high_mean": 0.003857038274873048, "clip_ratio/low_mean": 0.005732764257118106, "clip_ratio/low_min": 0.005732764257118106, "clip_ratio/region_mean": 0.009589802531991154, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 959.8125, "completions/mean_terminated_length": 959.8125, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "entropy": 0.04715270921587944, "epoch": 1.820000072800003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036478221882134676, "kl": 0.02177404670510441, "learning_rate": 7.4629799990301556e-06, "loss": 0.0001, "num_tokens": 8679020.0, "reward": 3.362204074859619, "reward_std": 9.831576347351074, "rewards/rollout_reward_func/mean": 3.362204074859619, "rewards/rollout_reward_func/std": 9.831575393676758, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.94538116455078, "sampling/sampling_logp_difference/mean": 0.354550838470459, "step": 455, "step_time": 17.89295210500859 }, { "clip_ratio/high_max": 0.005507000139914453, "clip_ratio/high_mean": 0.005507000139914453, "clip_ratio/low_mean": 0.005093118088552728, "clip_ratio/low_min": 0.005093118088552728, "clip_ratio/region_mean": 0.01060011814115569, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 968.4375, "completions/mean_terminated_length": 968.4375, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "entropy": 0.04512876691296697, "epoch": 1.824000072960003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036676174495369196, "kl": 0.02260570856742561, "learning_rate": 7.462979999025521e-06, "loss": 0.0001, "num_tokens": 8707421.0, "reward": 0.5072296857833862, "reward_std": 9.974767684936523, "rewards/rollout_reward_func/mean": 0.5072296857833862, "rewards/rollout_reward_func/std": 9.97476863861084, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.78125, "sampling/sampling_logp_difference/mean": 0.35151374340057373, "step": 456, "step_time": 18.09125287099596 }, { "clip_ratio/high_max": 0.006184746918734163, "clip_ratio/high_mean": 0.006184746918734163, "clip_ratio/low_mean": 0.004821597802219912, "clip_ratio/low_min": 0.004821597802219912, "clip_ratio/region_mean": 0.011006344691850245, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 958.9375, "completions/mean_terminated_length": 958.9375, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "entropy": 0.042448163498193026, "epoch": 1.828000073120003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002897644881159067, "kl": 0.022084422525949776, "learning_rate": 7.4629799990208755e-06, "loss": 0.0001, "num_tokens": 8735666.0, "reward": 0.13352704048156738, "reward_std": 7.621073246002197, "rewards/rollout_reward_func/mean": 0.13352704048156738, "rewards/rollout_reward_func/std": 7.621073246002197, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.609375, "sampling/sampling_logp_difference/mean": 0.3450094163417816, "step": 457, "step_time": 17.702564504012116 }, { "clip_ratio/high_max": 0.0034005217894446105, "clip_ratio/high_mean": 0.0034005217894446105, "clip_ratio/low_mean": 0.005647320445859805, "clip_ratio/low_min": 0.005647320445859805, "clip_ratio/region_mean": 0.009047842118889093, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 915.8125, "completions/mean_terminated_length": 915.8125, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "entropy": 0.0465502655133605, "epoch": 1.8320000732800028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0063810888677835464, "kl": 0.024574294686317444, "learning_rate": 7.4629799990162185e-06, "loss": 0.0001, "num_tokens": 8763235.0, "reward": 5.868980407714844, "reward_std": 27.195289611816406, "rewards/rollout_reward_func/mean": 5.868980407714844, "rewards/rollout_reward_func/std": 27.195289611816406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.25, "sampling/sampling_logp_difference/mean": 0.35241103172302246, "step": 458, "step_time": 17.902457977994345 }, { "clip_ratio/high_max": 0.005206904432270676, "clip_ratio/high_mean": 0.005206904432270676, "clip_ratio/low_mean": 0.0046889231307432055, "clip_ratio/low_min": 0.0046889231307432055, "clip_ratio/region_mean": 0.00989582750480622, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 989.5, "completions/mean_terminated_length": 989.5, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "entropy": 0.04421739932149649, "epoch": 1.836000073440003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003917745314538479, "kl": 0.021232625702396035, "learning_rate": 7.4629799990115505e-06, "loss": 0.0001, "num_tokens": 8792001.0, "reward": 2.7605385780334473, "reward_std": 6.124969959259033, "rewards/rollout_reward_func/mean": 2.7605385780334473, "rewards/rollout_reward_func/std": 6.12497091293335, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.250003814697266, "sampling/sampling_logp_difference/mean": 0.3475848138332367, "step": 459, "step_time": 18.183852980000665 }, { "clip_ratio/high_max": 0.003818581986706704, "clip_ratio/high_mean": 0.003818581986706704, "clip_ratio/low_mean": 0.005212793039390817, "clip_ratio/low_min": 0.005212793039390817, "clip_ratio/region_mean": 0.009031375113409013, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 966.0, "completions/mean_terminated_length": 966.0, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "entropy": 0.04542060196399689, "epoch": 1.840000073600003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004264036193490028, "kl": 0.022605729987844825, "learning_rate": 7.4629799990068715e-06, "loss": 0.0001, "num_tokens": 8820364.0, "reward": 5.1285600662231445, "reward_std": 11.757745742797852, "rewards/rollout_reward_func/mean": 5.1285600662231445, "rewards/rollout_reward_func/std": 11.757745742797852, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.015625, "sampling/sampling_logp_difference/mean": 0.3378565013408661, "step": 460, "step_time": 17.831976779008983 }, { "clip_ratio/high_max": 0.006953082687687129, "clip_ratio/high_mean": 0.006953082687687129, "clip_ratio/low_mean": 0.005014069931348786, "clip_ratio/low_min": 0.005014069931348786, "clip_ratio/region_mean": 0.011967152648139745, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 902.6875, "completions/mean_terminated_length": 902.6875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.04626606870442629, "epoch": 1.844000073760003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0039329733699560165, "kl": 0.02237188338767737, "learning_rate": 7.46297999900218e-06, "loss": 0.0001, "num_tokens": 8847680.0, "reward": 10.851908683776855, "reward_std": 26.467445373535156, "rewards/rollout_reward_func/mean": 10.851908683776855, "rewards/rollout_reward_func/std": 26.46744728088379, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.8125, "sampling/sampling_logp_difference/mean": 0.3617394268512726, "step": 461, "step_time": 17.592876217000594 }, { "clip_ratio/high_max": 0.004724599129986018, "clip_ratio/high_mean": 0.004724599129986018, "clip_ratio/low_mean": 0.005003833764931187, "clip_ratio/low_min": 0.005003833764931187, "clip_ratio/region_mean": 0.009728432982228696, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 959.0, "completions/mean_terminated_length": 959.0, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "entropy": 0.04613220039755106, "epoch": 1.8480000739200028e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034861527383327484, "kl": 0.021663793362677097, "learning_rate": 7.4629799989974804e-06, "loss": 0.0001, "num_tokens": 8875912.0, "reward": 2.184990406036377, "reward_std": 10.018134117126465, "rewards/rollout_reward_func/mean": 2.184990406036377, "rewards/rollout_reward_func/std": 10.018135070800781, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.312522888183594, "sampling/sampling_logp_difference/mean": 0.352043092250824, "step": 462, "step_time": 17.860447689978173 }, { "clip_ratio/high_max": 0.006160841614473611, "clip_ratio/high_mean": 0.006160841614473611, "clip_ratio/low_mean": 0.004316095379181206, "clip_ratio/low_min": 0.004316095379181206, "clip_ratio/region_mean": 0.010476937226485461, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 960.0625, "completions/mean_terminated_length": 960.0625, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "entropy": 0.044893944170325994, "epoch": 1.852000074080003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006658511236310005, "kl": 0.02461204258725047, "learning_rate": 7.4629799989927675e-06, "loss": 0.0001, "num_tokens": 8904169.0, "reward": 1.1027697324752808, "reward_std": 13.680204391479492, "rewards/rollout_reward_func/mean": 1.1027697324752808, "rewards/rollout_reward_func/std": 13.680205345153809, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.6875, "sampling/sampling_logp_difference/mean": 0.34764358401298523, "step": 463, "step_time": 17.79471921201184 }, { "clip_ratio/high_max": 0.007075339060975239, "clip_ratio/high_mean": 0.007075339060975239, "clip_ratio/low_mean": 0.003564299229765311, "clip_ratio/low_min": 0.003564299229765311, "clip_ratio/region_mean": 0.01063963834894821, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 974.1875, "completions/mean_terminated_length": 974.1875, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "entropy": 0.04574886662885547, "epoch": 1.856000074240003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030323867686092854, "kl": 0.01956815249286592, "learning_rate": 7.4629799989880445e-06, "loss": 0.0001, "num_tokens": 8932668.0, "reward": 2.679905414581299, "reward_std": 12.471952438354492, "rewards/rollout_reward_func/mean": 2.679905414581299, "rewards/rollout_reward_func/std": 12.471952438354492, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.25000762939453, "sampling/sampling_logp_difference/mean": 0.3405258357524872, "step": 464, "step_time": 17.82830331999139 }, { "clip_ratio/high_max": 0.0050091061857528985, "clip_ratio/high_mean": 0.0050091061857528985, "clip_ratio/low_mean": 0.006061458436306566, "clip_ratio/low_min": 0.006061458436306566, "clip_ratio/region_mean": 0.011070564680267125, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 959.75, "completions/mean_terminated_length": 959.75, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "entropy": 0.04767734371125698, "epoch": 1.860000074400003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005730677396059036, "kl": 0.019960437319241464, "learning_rate": 7.4629799989833096e-06, "loss": 0.0001, "num_tokens": 8960936.0, "reward": 2.824253559112549, "reward_std": 17.043109893798828, "rewards/rollout_reward_func/mean": 2.824253559112549, "rewards/rollout_reward_func/std": 17.043109893798828, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 35.50026321411133, "sampling/sampling_logp_difference/mean": 0.35631710290908813, "step": 465, "step_time": 17.66381221800839 }, { "clip_ratio/high_max": 0.006045208836439997, "clip_ratio/high_mean": 0.006045208836439997, "clip_ratio/low_mean": 0.005330376123310998, "clip_ratio/low_min": 0.005330376123310998, "clip_ratio/region_mean": 0.011375584988854825, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 961.0, "completions/mean_terminated_length": 961.0, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "entropy": 0.044997429475188255, "epoch": 1.864000074560003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027696331962943077, "kl": 0.020412554149515927, "learning_rate": 7.4629799989785645e-06, "loss": 0.0001, "num_tokens": 8989217.0, "reward": 0.8317772150039673, "reward_std": 10.556432723999023, "rewards/rollout_reward_func/mean": 0.8317772150039673, "rewards/rollout_reward_func/std": 10.55643367767334, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.9375, "sampling/sampling_logp_difference/mean": 0.341600626707077, "step": 466, "step_time": 17.865711949001707 }, { "clip_ratio/high_max": 0.004223875788738951, "clip_ratio/high_mean": 0.004223875788738951, "clip_ratio/low_mean": 0.004934507160214707, "clip_ratio/low_min": 0.004934507160214707, "clip_ratio/region_mean": 0.009158382890745997, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 944.9375, "completions/mean_terminated_length": 944.9375, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "entropy": 0.04515645420178771, "epoch": 1.868000074720003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004629611968994141, "kl": 0.0218158372445032, "learning_rate": 7.462979998973808e-06, "loss": 0.0001, "num_tokens": 9017223.0, "reward": 0.17805194854736328, "reward_std": 5.663906574249268, "rewards/rollout_reward_func/mean": 0.17805194854736328, "rewards/rollout_reward_func/std": 5.663906097412109, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.328125, "sampling/sampling_logp_difference/mean": 0.3580007553100586, "step": 467, "step_time": 17.87628013900394 }, { "clip_ratio/high_max": 0.007075943518429995, "clip_ratio/high_mean": 0.007075943518429995, "clip_ratio/low_mean": 0.00464954788913019, "clip_ratio/low_min": 0.00464954788913019, "clip_ratio/region_mean": 0.011725491378456354, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 981.625, "completions/mean_terminated_length": 981.625, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "entropy": 0.044368558563292027, "epoch": 1.872000074880003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00367966596968472, "kl": 0.02059552853461355, "learning_rate": 7.462979998969041e-06, "loss": 0.0001, "num_tokens": 9045854.0, "reward": 6.425475120544434, "reward_std": 14.721678733825684, "rewards/rollout_reward_func/mean": 6.425475120544434, "rewards/rollout_reward_func/std": 14.721678733825684, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.28125, "sampling/sampling_logp_difference/mean": 0.342896968126297, "step": 468, "step_time": 18.061332078003034 }, { "clip_ratio/high_max": 0.007057052891468629, "clip_ratio/high_mean": 0.007057052891468629, "clip_ratio/low_mean": 0.004081782506546006, "clip_ratio/low_min": 0.004081782506546006, "clip_ratio/region_mean": 0.011138835456222296, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 981.8125, "completions/mean_terminated_length": 981.8125, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "entropy": 0.0431697703897953, "epoch": 1.876000075040003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007076679263263941, "kl": 0.0215745271416381, "learning_rate": 7.462979998964262e-06, "loss": 0.0001, "num_tokens": 9074487.0, "reward": -3.8618268966674805, "reward_std": 10.274144172668457, "rewards/rollout_reward_func/mean": -3.8618268966674805, "rewards/rollout_reward_func/std": 10.274144172668457, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.390625, "sampling/sampling_logp_difference/mean": 0.33478352427482605, "step": 469, "step_time": 17.768850963009754 }, { "clip_ratio/high_max": 0.003511879884172231, "clip_ratio/high_mean": 0.003511879884172231, "clip_ratio/low_mean": 0.006035339494701475, "clip_ratio/low_min": 0.006035339494701475, "clip_ratio/region_mean": 0.009547219378873706, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 980.9375, "completions/mean_terminated_length": 980.9375, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "entropy": 0.042814023327082396, "epoch": 1.880000075200003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.06748253107070923, "kl": 0.08528465882409364, "learning_rate": 7.462979998959473e-06, "loss": 0.0004, "num_tokens": 9103116.0, "reward": 6.192269325256348, "reward_std": 12.277271270751953, "rewards/rollout_reward_func/mean": 6.192269325256348, "rewards/rollout_reward_func/std": 12.277271270751953, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.125, "sampling/sampling_logp_difference/mean": 0.3464241325855255, "step": 470, "step_time": 17.911092172005738 }, { "clip_ratio/high_max": 0.005524300911929458, "clip_ratio/high_mean": 0.005524300911929458, "clip_ratio/low_mean": 0.00486396107589826, "clip_ratio/low_min": 0.00486396107589826, "clip_ratio/region_mean": 0.010388261987827718, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 961.125, "completions/mean_terminated_length": 961.125, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "entropy": 0.04480479331687093, "epoch": 1.884000075360003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034785233438014984, "kl": 0.018260675366036594, "learning_rate": 7.462979998954672e-06, "loss": 0.0001, "num_tokens": 9131389.0, "reward": -2.3090851306915283, "reward_std": 6.0826849937438965, "rewards/rollout_reward_func/mean": -2.3090851306915283, "rewards/rollout_reward_func/std": 6.0826849937438965, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.5, "sampling/sampling_logp_difference/mean": 0.3544117212295532, "step": 471, "step_time": 17.827879456992378 }, { "clip_ratio/high_max": 0.004281631903722882, "clip_ratio/high_mean": 0.004281631903722882, "clip_ratio/low_mean": 0.005725635506678373, "clip_ratio/low_min": 0.005725635506678373, "clip_ratio/region_mean": 0.010007267293985933, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 976.0625, "completions/mean_terminated_length": 976.0625, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "entropy": 0.047296736389398575, "epoch": 1.888000075520003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022295964881777763, "kl": 0.018805668922141194, "learning_rate": 7.462979998949861e-06, "loss": 0.0001, "num_tokens": 9159933.0, "reward": 3.3395557403564453, "reward_std": 15.541242599487305, "rewards/rollout_reward_func/mean": 3.3395557403564453, "rewards/rollout_reward_func/std": 15.541242599487305, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.1953125, "sampling/sampling_logp_difference/mean": 0.3360109329223633, "step": 472, "step_time": 17.759788415998628 }, { "clip_ratio/high_max": 0.005791740230051801, "clip_ratio/high_mean": 0.005791740230051801, "clip_ratio/low_mean": 0.00421445319079794, "clip_ratio/low_min": 0.00421445319079794, "clip_ratio/region_mean": 0.010006193304434419, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 953.1875, "completions/mean_terminated_length": 953.1875, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "entropy": 0.045548006426543, "epoch": 1.892000075680003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026024412363767624, "kl": 0.018330074730329216, "learning_rate": 7.462979998945038e-06, "loss": 0.0001, "num_tokens": 9188083.0, "reward": 0.7375516891479492, "reward_std": 3.774975299835205, "rewards/rollout_reward_func/mean": 0.7375516891479492, "rewards/rollout_reward_func/std": 3.774975299835205, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.140625, "sampling/sampling_logp_difference/mean": 0.3426235318183899, "step": 473, "step_time": 17.938560375987436 }, { "clip_ratio/high_max": 0.00523999345023185, "clip_ratio/high_mean": 0.00523999345023185, "clip_ratio/low_mean": 0.006127862317953259, "clip_ratio/low_min": 0.006127862317953259, "clip_ratio/region_mean": 0.01136785582639277, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 983.1875, "completions/mean_terminated_length": 983.1875, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "entropy": 0.04399797972291708, "epoch": 1.896000075840003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00277642416767776, "kl": 0.020340127404779196, "learning_rate": 7.462979998940204e-06, "loss": 0.0001, "num_tokens": 9216748.0, "reward": 1.5361425876617432, "reward_std": 10.642451286315918, "rewards/rollout_reward_func/mean": 1.5361425876617432, "rewards/rollout_reward_func/std": 10.642451286315918, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 36.25, "sampling/sampling_logp_difference/mean": 0.3454201817512512, "step": 474, "step_time": 17.74400728598266 }, { "clip_ratio/high_max": 0.006093973759561777, "clip_ratio/high_mean": 0.006093973759561777, "clip_ratio/low_mean": 0.0034781343128997833, "clip_ratio/low_min": 0.0034781343128997833, "clip_ratio/region_mean": 0.00957210810156539, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 979.375, "completions/mean_terminated_length": 979.375, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "entropy": 0.047481794375926256, "epoch": 1.9000000760000032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.017009686678647995, "kl": 0.023332121199928224, "learning_rate": 7.462979998935359e-06, "loss": 0.0001, "num_tokens": 9245341.0, "reward": 0.31474339962005615, "reward_std": 8.393532752990723, "rewards/rollout_reward_func/mean": 0.31474339962005615, "rewards/rollout_reward_func/std": 8.393532752990723, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.03125, "sampling/sampling_logp_difference/mean": 0.325614333152771, "step": 475, "step_time": 21.799918014003197 }, { "clip_ratio/high_max": 0.00422528779017739, "clip_ratio/high_mean": 0.00422528779017739, "clip_ratio/low_mean": 0.005690010177204385, "clip_ratio/low_min": 0.005690010177204385, "clip_ratio/region_mean": 0.009915297967381775, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 971.625, "completions/mean_terminated_length": 971.625, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "entropy": 0.04679801780730486, "epoch": 1.904000076160003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005205073859542608, "kl": 0.018819114891812205, "learning_rate": 7.462979998930503e-06, "loss": 0.0001, "num_tokens": 9273811.0, "reward": -0.09195995330810547, "reward_std": 9.66384220123291, "rewards/rollout_reward_func/mean": -0.09195995330810547, "rewards/rollout_reward_func/std": 9.66384220123291, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.75, "sampling/sampling_logp_difference/mean": 0.33941295742988586, "step": 476, "step_time": 18.123585361005098 }, { "clip_ratio/high_max": 0.0039351585728581995, "clip_ratio/high_mean": 0.0039351585728581995, "clip_ratio/low_mean": 0.006070958770578727, "clip_ratio/low_min": 0.006070958770578727, "clip_ratio/region_mean": 0.010006117459852248, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 895.25, "completions/mean_terminated_length": 895.25, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.04671099176630378, "epoch": 1.908000076320003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009364905767142773, "kl": 0.02703534415923059, "learning_rate": 7.462979998925636e-06, "loss": 0.0001, "num_tokens": 9300998.0, "reward": 10.39941120147705, "reward_std": 24.810529708862305, "rewards/rollout_reward_func/mean": 10.39941120147705, "rewards/rollout_reward_func/std": 24.810527801513672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.96875, "sampling/sampling_logp_difference/mean": 0.35658028721809387, "step": 477, "step_time": 17.266333473002305 }, { "clip_ratio/high_max": 0.0041532734176144, "clip_ratio/high_mean": 0.0041532734176144, "clip_ratio/low_mean": 0.005421009205747396, "clip_ratio/low_min": 0.005421009205747396, "clip_ratio/region_mean": 0.009574282623361796, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 962.0, "completions/mean_terminated_length": 962.0, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "entropy": 0.04471696773543954, "epoch": 1.912000076480003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005562166217714548, "kl": 0.01924496074207127, "learning_rate": 7.462979998920759e-06, "loss": 0.0001, "num_tokens": 9329298.0, "reward": 2.0551578998565674, "reward_std": 7.132735252380371, "rewards/rollout_reward_func/mean": 2.0551578998565674, "rewards/rollout_reward_func/std": 7.132735729217529, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.78908920288086, "sampling/sampling_logp_difference/mean": 0.34073248505592346, "step": 478, "step_time": 17.669108751004387 }, { "clip_ratio/high_max": 0.004491479572607204, "clip_ratio/high_mean": 0.004491479572607204, "clip_ratio/low_mean": 0.005359189788578078, "clip_ratio/low_min": 0.005359189788578078, "clip_ratio/region_mean": 0.009850669419392943, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 968.8125, "completions/mean_terminated_length": 968.8125, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "entropy": 0.04653573269024491, "epoch": 1.916000076640003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003949125297367573, "kl": 0.020944926654919982, "learning_rate": 7.46297999891587e-06, "loss": 0.0001, "num_tokens": 9357707.0, "reward": 4.312682151794434, "reward_std": 14.607549667358398, "rewards/rollout_reward_func/mean": 4.312682151794434, "rewards/rollout_reward_func/std": 14.607550621032715, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.640628814697266, "sampling/sampling_logp_difference/mean": 0.34041857719421387, "step": 479, "step_time": 17.689575012991554 }, { "clip_ratio/high_max": 0.0043010934023186564, "clip_ratio/high_mean": 0.0043010934023186564, "clip_ratio/low_mean": 0.005851588473888114, "clip_ratio/low_min": 0.005851588473888114, "clip_ratio/region_mean": 0.010152681963518262, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 941.8125, "completions/mean_terminated_length": 941.8125, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "entropy": 0.04503756761550903, "epoch": 1.9200000768000032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004486192017793655, "kl": 0.02380367077421397, "learning_rate": 7.4629799989109696e-06, "loss": 0.0001, "num_tokens": 9385651.0, "reward": -2.9352211952209473, "reward_std": 8.86149787902832, "rewards/rollout_reward_func/mean": -2.9352211952209473, "rewards/rollout_reward_func/std": 8.86149787902832, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.93752670288086, "sampling/sampling_logp_difference/mean": 0.3445151746273041, "step": 480, "step_time": 17.800377375002427 }, { "clip_ratio/high_max": 0.0032925939303822815, "clip_ratio/high_mean": 0.0032925939303822815, "clip_ratio/low_mean": 0.00525538349756971, "clip_ratio/low_min": 0.00525538349756971, "clip_ratio/region_mean": 0.00854797736974433, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 1236.75, "completions/mean_terminated_length": 1236.75, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.04166347160935402, "epoch": 1.924000076960003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011586531065404415, "kl": 0.02375549077987671, "learning_rate": 7.462979998906059e-06, "loss": 0.0001, "num_tokens": 9418398.0, "reward": 1.5124220848083496, "reward_std": 26.323020935058594, "rewards/rollout_reward_func/mean": 1.5124220848083496, "rewards/rollout_reward_func/std": 26.323022842407227, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.596927642822266, "sampling/sampling_logp_difference/mean": 0.29744958877563477, "step": 481, "step_time": 20.99859359300899 }, { "clip_ratio/high_max": 0.004074400552781299, "clip_ratio/high_mean": 0.004074400552781299, "clip_ratio/low_mean": 0.005023510733735748, "clip_ratio/low_min": 0.005023510733735748, "clip_ratio/region_mean": 0.009097911301068962, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 1225.8125, "completions/mean_terminated_length": 1225.8125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "entropy": 0.043089814484119415, "epoch": 1.928000077120003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006073178257793188, "kl": 0.01865870808251202, "learning_rate": 7.462979998901136e-06, "loss": 0.0001, "num_tokens": 9450944.0, "reward": 2.810795783996582, "reward_std": 29.615005493164062, "rewards/rollout_reward_func/mean": 2.810795783996582, "rewards/rollout_reward_func/std": 29.615005493164062, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.125, "sampling/sampling_logp_difference/mean": 0.28978103399276733, "step": 482, "step_time": 20.853138912003487 }, { "clip_ratio/high_max": 0.004531808866886422, "clip_ratio/high_mean": 0.004531808866886422, "clip_ratio/low_mean": 0.004026298061944544, "clip_ratio/low_min": 0.004026298061944544, "clip_ratio/region_mean": 0.008558107016142458, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 1281.875, "completions/mean_terminated_length": 1281.875, "completions/min_length": 1159.0, "completions/min_terminated_length": 1159.0, "entropy": 0.04380662413313985, "epoch": 1.932000077280003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.03133983165025711, "kl": 0.03028249880298972, "learning_rate": 7.462979998896204e-06, "loss": 0.0002, "num_tokens": 9484376.0, "reward": -6.582253456115723, "reward_std": 9.12781810760498, "rewards/rollout_reward_func/mean": -6.582253456115723, "rewards/rollout_reward_func/std": 9.12781810760498, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.71875, "sampling/sampling_logp_difference/mean": 0.30698493123054504, "step": 483, "step_time": 20.94910499699472 }, { "clip_ratio/high_max": 0.0039926211757119745, "clip_ratio/high_mean": 0.0039926211757119745, "clip_ratio/low_mean": 0.0037701359833590686, "clip_ratio/low_min": 0.0037701359833590686, "clip_ratio/region_mean": 0.007762757071759552, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 1283.625, "completions/mean_terminated_length": 1283.625, "completions/min_length": 1210.0, "completions/min_terminated_length": 1210.0, "entropy": 0.043060097843408585, "epoch": 1.9360000774400032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035864547826349735, "kl": 0.021081674261949956, "learning_rate": 7.462979998891259e-06, "loss": 0.0001, "num_tokens": 9517846.0, "reward": -3.8298606872558594, "reward_std": 9.91622543334961, "rewards/rollout_reward_func/mean": -3.8298606872558594, "rewards/rollout_reward_func/std": 9.916226387023926, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.21875, "sampling/sampling_logp_difference/mean": 0.30648693442344666, "step": 484, "step_time": 21.213987419985642 }, { "clip_ratio/high_max": 0.005368135229218751, "clip_ratio/high_mean": 0.005368135229218751, "clip_ratio/low_mean": 0.0040241481037810445, "clip_ratio/low_min": 0.0040241481037810445, "clip_ratio/region_mean": 0.009392283449415118, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 1252.8125, "completions/mean_terminated_length": 1252.8125, "completions/min_length": 1169.0, "completions/min_terminated_length": 1169.0, "entropy": 0.043346078135073185, "epoch": 1.940000077600003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003131216624751687, "kl": 0.01986404147464782, "learning_rate": 7.462979998886304e-06, "loss": 0.0001, "num_tokens": 9550802.0, "reward": -2.5811517238616943, "reward_std": 10.683445930480957, "rewards/rollout_reward_func/mean": -2.5811517238616943, "rewards/rollout_reward_func/std": 10.683445930480957, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.6406364440918, "sampling/sampling_logp_difference/mean": 0.3090743124485016, "step": 485, "step_time": 20.88306250599271 }, { "clip_ratio/high_max": 0.0026437901833560318, "clip_ratio/high_mean": 0.0026437901833560318, "clip_ratio/low_mean": 0.006102774903411046, "clip_ratio/low_min": 0.006102774903411046, "clip_ratio/region_mean": 0.008746565028559417, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 1181.6875, "completions/mean_terminated_length": 1181.6875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 0.043362187687307596, "epoch": 1.944000077760003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033672810532152653, "kl": 0.01963152748066932, "learning_rate": 7.462979998881337e-06, "loss": 0.0001, "num_tokens": 9582620.0, "reward": 3.534945249557495, "reward_std": 25.417804718017578, "rewards/rollout_reward_func/mean": 3.534945249557495, "rewards/rollout_reward_func/std": 25.417804718017578, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.625, "sampling/sampling_logp_difference/mean": 0.3123548924922943, "step": 486, "step_time": 20.820900965001783 }, { "clip_ratio/high_max": 0.003973769140429795, "clip_ratio/high_mean": 0.003973769140429795, "clip_ratio/low_mean": 0.004271734855137765, "clip_ratio/low_min": 0.004271734855137765, "clip_ratio/region_mean": 0.008245504111982882, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 1269.625, "completions/mean_terminated_length": 1269.625, "completions/min_length": 1186.0, "completions/min_terminated_length": 1186.0, "entropy": 0.043006498366594315, "epoch": 1.948000077920003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034006517380476, "kl": 0.01883794041350484, "learning_rate": 7.462979998876361e-06, "loss": 0.0001, "num_tokens": 9615848.0, "reward": -5.397332191467285, "reward_std": 7.634355545043945, "rewards/rollout_reward_func/mean": -5.397332191467285, "rewards/rollout_reward_func/std": 7.634355545043945, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.171878814697266, "sampling/sampling_logp_difference/mean": 0.3024238646030426, "step": 487, "step_time": 21.020174651006528 }, { "clip_ratio/high_max": 0.005213877680944279, "clip_ratio/high_mean": 0.005213877680944279, "clip_ratio/low_mean": 0.0041133168851956725, "clip_ratio/low_min": 0.0041133168851956725, "clip_ratio/region_mean": 0.009327194595243782, "completions/clipped_ratio": 0.0, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 1252.6875, "completions/mean_terminated_length": 1252.6875, "completions/min_length": 1176.0, "completions/min_terminated_length": 1176.0, "entropy": 0.04382199654355645, "epoch": 1.9520000780800032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00434225145727396, "kl": 0.023977731121703982, "learning_rate": 7.4629799988713725e-06, "loss": 0.0001, "num_tokens": 9648790.0, "reward": -2.966677188873291, "reward_std": 8.818498611450195, "rewards/rollout_reward_func/mean": -2.966677188873291, "rewards/rollout_reward_func/std": 8.818499565124512, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.28125, "sampling/sampling_logp_difference/mean": 0.30878525972366333, "step": 488, "step_time": 20.952371059000143 }, { "clip_ratio/high_max": 0.005152724697836675, "clip_ratio/high_mean": 0.005152724697836675, "clip_ratio/low_mean": 0.0029532981279771775, "clip_ratio/low_min": 0.0029532981279771775, "clip_ratio/region_mean": 0.008106022898573428, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 1248.5, "completions/mean_terminated_length": 1248.5, "completions/min_length": 1229.0, "completions/min_terminated_length": 1229.0, "entropy": 0.0442354129627347, "epoch": 1.9560000782400033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010917332023382187, "kl": 0.023030847078189254, "learning_rate": 7.462979998866372e-06, "loss": 0.0001, "num_tokens": 9681660.0, "reward": -4.340898513793945, "reward_std": 9.700759887695312, "rewards/rollout_reward_func/mean": -4.340898513793945, "rewards/rollout_reward_func/std": 9.700760841369629, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.6875, "sampling/sampling_logp_difference/mean": 0.30845898389816284, "step": 489, "step_time": 20.125915327000257 }, { "clip_ratio/high_max": 0.005059683404397219, "clip_ratio/high_mean": 0.005059683404397219, "clip_ratio/low_mean": 0.003232522984035313, "clip_ratio/low_min": 0.003232522984035313, "clip_ratio/region_mean": 0.008292206446640193, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 1222.0625, "completions/mean_terminated_length": 1222.0625, "completions/min_length": 1192.0, "completions/min_terminated_length": 1192.0, "entropy": 0.04369911411777139, "epoch": 1.960000078400003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028041142504662275, "kl": 0.018532401765696704, "learning_rate": 7.462979998861361e-06, "loss": 0.0001, "num_tokens": 9714100.0, "reward": -5.1969146728515625, "reward_std": 9.900035858154297, "rewards/rollout_reward_func/mean": -5.1969146728515625, "rewards/rollout_reward_func/std": 9.900035858154297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.25, "sampling/sampling_logp_difference/mean": 0.305032342672348, "step": 490, "step_time": 20.09284653099894 }, { "clip_ratio/high_max": 0.005343461409211159, "clip_ratio/high_mean": 0.005343461409211159, "clip_ratio/low_mean": 0.003691718098707497, "clip_ratio/low_min": 0.003691718098707497, "clip_ratio/region_mean": 0.009035179507918656, "completions/clipped_ratio": 0.0, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 1245.625, "completions/mean_terminated_length": 1245.625, "completions/min_length": 1173.0, "completions/min_terminated_length": 1173.0, "entropy": 0.04361470229923725, "epoch": 1.964000078560003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.10026780515909195, "kl": 0.03614890342578292, "learning_rate": 7.462979998856341e-06, "loss": 0.0002, "num_tokens": 9746934.0, "reward": -3.465593099594116, "reward_std": 6.890316963195801, "rewards/rollout_reward_func/mean": -3.465593099594116, "rewards/rollout_reward_func/std": 6.890316963195801, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.172237396240234, "sampling/sampling_logp_difference/mean": 0.3094227612018585, "step": 491, "step_time": 20.94773486199847 }, { "clip_ratio/high_max": 0.005076361994724721, "clip_ratio/high_mean": 0.005076361994724721, "clip_ratio/low_mean": 0.00429964458453469, "clip_ratio/low_min": 0.00429964458453469, "clip_ratio/region_mean": 0.00937600655015558, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 1226.625, "completions/mean_terminated_length": 1226.625, "completions/min_length": 1168.0, "completions/min_terminated_length": 1168.0, "entropy": 0.04309246549382806, "epoch": 1.9680000787200032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0054536242969334126, "kl": 0.022490131435915828, "learning_rate": 7.462979998851308e-06, "loss": 0.0001, "num_tokens": 9779440.0, "reward": -2.3116798400878906, "reward_std": 16.498924255371094, "rewards/rollout_reward_func/mean": -2.3116798400878906, "rewards/rollout_reward_func/std": 16.498924255371094, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.437503814697266, "sampling/sampling_logp_difference/mean": 0.30777060985565186, "step": 492, "step_time": 20.940433880990895 }, { "clip_ratio/high_max": 0.003772142226807773, "clip_ratio/high_mean": 0.003772142226807773, "clip_ratio/low_mean": 0.0050584109558258206, "clip_ratio/low_min": 0.0050584109558258206, "clip_ratio/region_mean": 0.008830553153529763, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 1246.25, "completions/mean_terminated_length": 1246.25, "completions/min_length": 1175.0, "completions/min_terminated_length": 1175.0, "entropy": 0.04097805079072714, "epoch": 1.9720000788800033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010853483341634274, "kl": 0.020651204511523247, "learning_rate": 7.462979998846264e-06, "loss": 0.0001, "num_tokens": 9812285.0, "reward": -1.1360833644866943, "reward_std": 9.78809642791748, "rewards/rollout_reward_func/mean": -1.1360833644866943, "rewards/rollout_reward_func/std": 9.788097381591797, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.0, "sampling/sampling_logp_difference/mean": 0.3099733889102936, "step": 493, "step_time": 21.024477609011228 }, { "clip_ratio/high_max": 0.005222172185312957, "clip_ratio/high_mean": 0.005222172185312957, "clip_ratio/low_mean": 0.004659018479287624, "clip_ratio/low_min": 0.004659018479287624, "clip_ratio/region_mean": 0.009881190722808242, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/max_terminated_length": 1275.0, "completions/mean_length": 1204.8125, "completions/mean_terminated_length": 1204.8125, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "entropy": 0.04496691795065999, "epoch": 1.976000079040003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026703225448727608, "kl": 0.018260243232361972, "learning_rate": 7.46297999884121e-06, "loss": 0.0001, "num_tokens": 9844435.0, "reward": 2.679790735244751, "reward_std": 16.03082275390625, "rewards/rollout_reward_func/mean": 2.679790735244751, "rewards/rollout_reward_func/std": 16.03082275390625, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.3125, "sampling/sampling_logp_difference/mean": 0.3229120373725891, "step": 494, "step_time": 20.651448982003785 }, { "clip_ratio/high_max": 0.0027496289112605155, "clip_ratio/high_mean": 0.0027496289112605155, "clip_ratio/low_mean": 0.005577925941906869, "clip_ratio/low_min": 0.005577925941906869, "clip_ratio/region_mean": 0.008327554794959724, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 1212.75, "completions/mean_terminated_length": 1212.75, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.04247347917407751, "epoch": 1.980000079200003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009569249115884304, "kl": 0.02074545982759446, "learning_rate": 7.4629799988361435e-06, "loss": 0.0001, "num_tokens": 9876776.0, "reward": 5.984315872192383, "reward_std": 23.468257904052734, "rewards/rollout_reward_func/mean": 5.984315872192383, "rewards/rollout_reward_func/std": 23.468259811401367, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.8125, "sampling/sampling_logp_difference/mean": 0.32467108964920044, "step": 495, "step_time": 21.1153312339884 }, { "clip_ratio/high_max": 0.0027861494454555213, "clip_ratio/high_mean": 0.0027861494454555213, "clip_ratio/low_mean": 0.006656790501438081, "clip_ratio/low_min": 0.006656790501438081, "clip_ratio/region_mean": 0.009442940005101264, "completions/clipped_ratio": 0.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 1265.0, "completions/mean_terminated_length": 1265.0, "completions/min_length": 1153.0, "completions/min_terminated_length": 1153.0, "entropy": 0.04093753080815077, "epoch": 1.9840000793600032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003117650980129838, "kl": 0.016892201150767505, "learning_rate": 7.462979998831066e-06, "loss": 0.0001, "num_tokens": 9909930.0, "reward": -1.7662065029144287, "reward_std": 8.466126441955566, "rewards/rollout_reward_func/mean": -1.7662065029144287, "rewards/rollout_reward_func/std": 8.466126441955566, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.5625, "sampling/sampling_logp_difference/mean": 0.3102709650993347, "step": 496, "step_time": 21.123517020998406 }, { "clip_ratio/high_max": 0.004450104635907337, "clip_ratio/high_mean": 0.004450104635907337, "clip_ratio/low_mean": 0.004664712236262858, "clip_ratio/low_min": 0.004664712236262858, "clip_ratio/region_mean": 0.009114816843066365, "completions/clipped_ratio": 0.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 1301.75, "completions/mean_terminated_length": 1301.75, "completions/min_length": 1267.0, "completions/min_terminated_length": 1267.0, "entropy": 0.03977243788540363, "epoch": 1.9880000795200033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034201559610664845, "kl": 0.015476009575650096, "learning_rate": 7.462979998825979e-06, "loss": 0.0001, "num_tokens": 9943680.0, "reward": -5.489652633666992, "reward_std": 6.958258152008057, "rewards/rollout_reward_func/mean": -5.489652633666992, "rewards/rollout_reward_func/std": 6.958258628845215, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.6875, "sampling/sampling_logp_difference/mean": 0.3123524785041809, "step": 497, "step_time": 21.10731424397818 }, { "clip_ratio/high_max": 0.004944002546835691, "clip_ratio/high_mean": 0.004944002546835691, "clip_ratio/low_mean": 0.0042023865098599344, "clip_ratio/low_min": 0.0042023865098599344, "clip_ratio/region_mean": 0.009146389085799456, "completions/clipped_ratio": 0.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 1285.9375, "completions/mean_terminated_length": 1285.9375, "completions/min_length": 1236.0, "completions/min_terminated_length": 1236.0, "entropy": 0.039920030161738396, "epoch": 1.9920000796800033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005852879956364632, "kl": 0.017206937773153186, "learning_rate": 7.46297999882088e-06, "loss": 0.0001, "num_tokens": 9977192.0, "reward": -6.310553073883057, "reward_std": 10.19863224029541, "rewards/rollout_reward_func/mean": -6.310553073883057, "rewards/rollout_reward_func/std": 10.19863224029541, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.1328125, "sampling/sampling_logp_difference/mean": 0.3082577586174011, "step": 498, "step_time": 21.393087700016622 }, { "clip_ratio/high_max": 0.003418764434172772, "clip_ratio/high_mean": 0.003418764434172772, "clip_ratio/low_mean": 0.005879250704310834, "clip_ratio/low_min": 0.005879250704310834, "clip_ratio/region_mean": 0.009298015211243182, "completions/clipped_ratio": 0.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 1188.375, "completions/mean_terminated_length": 1188.375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.03975221561267972, "epoch": 1.996000079840003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026702044997364283, "kl": 0.018302207230590284, "learning_rate": 7.46297999881577e-06, "loss": 0.0001, "num_tokens": 10009097.0, "reward": 1.8728957176208496, "reward_std": 25.442890167236328, "rewards/rollout_reward_func/mean": 1.8728957176208496, "rewards/rollout_reward_func/std": 25.442890167236328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.59375, "sampling/sampling_logp_difference/mean": 0.32452133297920227, "step": 499, "step_time": 20.515539535990683 }, { "clip_ratio/high_max": 0.004347858688561246, "clip_ratio/high_mean": 0.004347858688561246, "clip_ratio/low_mean": 0.004875128128333017, "clip_ratio/low_min": 0.004875128128333017, "clip_ratio/region_mean": 0.00922298664227128, "completions/clipped_ratio": 0.0, "completions/max_length": 1273.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 1219.875, "completions/mean_terminated_length": 1219.875, "completions/min_length": 1190.0, "completions/min_terminated_length": 1190.0, "entropy": 0.04231398832052946, "epoch": 2.000000080000003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0070542520843446255, "kl": 0.017819955945014954, "learning_rate": 7.462979998810649e-06, "loss": 0.0001, "num_tokens": 10041495.0, "reward": -5.074420928955078, "reward_std": 9.318907737731934, "rewards/rollout_reward_func/mean": -5.074420928955078, "rewards/rollout_reward_func/std": 9.318907737731934, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.125, "sampling/sampling_logp_difference/mean": 0.3194105625152588, "step": 500, "step_time": 20.530813136007055 }, { "clip_ratio/high_max": 0.004501495190197602, "clip_ratio/high_mean": 0.004501495190197602, "clip_ratio/low_mean": 0.004086890257894993, "clip_ratio/low_min": 0.004086890257894993, "clip_ratio/region_mean": 0.008588385360781103, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 1244.3125, "completions/mean_terminated_length": 1244.3125, "completions/min_length": 1087.0, "completions/min_terminated_length": 1087.0, "entropy": 0.042055899277329445, "epoch": 2.0040000801600032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028306751046329737, "kl": 0.01664492953568697, "learning_rate": 7.462979998805517e-06, "loss": 0.0001, "num_tokens": 10074314.0, "reward": -2.175917148590088, "reward_std": 13.554153442382812, "rewards/rollout_reward_func/mean": -2.175917148590088, "rewards/rollout_reward_func/std": 13.554152488708496, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.62501525878906, "sampling/sampling_logp_difference/mean": 0.31684044003486633, "step": 501, "step_time": 21.26467006700841 }, { "clip_ratio/high_max": 0.004366524197394028, "clip_ratio/high_mean": 0.004366524197394028, "clip_ratio/low_mean": 0.0040266907890327275, "clip_ratio/low_min": 0.0040266907890327275, "clip_ratio/region_mean": 0.008393215073738247, "completions/clipped_ratio": 0.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 1269.25, "completions/mean_terminated_length": 1269.25, "completions/min_length": 1175.0, "completions/min_terminated_length": 1175.0, "entropy": 0.039672681130468845, "epoch": 2.0080000803200033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003612970234826207, "kl": 0.018224804778583348, "learning_rate": 7.462979998800373e-06, "loss": 0.0001, "num_tokens": 10107554.0, "reward": -1.9626891613006592, "reward_std": 11.55454158782959, "rewards/rollout_reward_func/mean": -1.9626891613006592, "rewards/rollout_reward_func/std": 11.55454158782959, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.03125, "sampling/sampling_logp_difference/mean": 0.3216320872306824, "step": 502, "step_time": 21.25918510601332 }, { "clip_ratio/high_max": 0.005566149746300653, "clip_ratio/high_mean": 0.005566149746300653, "clip_ratio/low_mean": 0.00437465391587466, "clip_ratio/low_min": 0.00437465391587466, "clip_ratio/region_mean": 0.00994080351665616, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 1294.875, "completions/mean_terminated_length": 1294.875, "completions/min_length": 1228.0, "completions/min_terminated_length": 1228.0, "entropy": 0.03865442890673876, "epoch": 2.012000080480003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.03194081783294678, "kl": 0.02130114787723869, "learning_rate": 7.46297999879522e-06, "loss": 0.0001, "num_tokens": 10141209.0, "reward": -3.410893678665161, "reward_std": 4.572993278503418, "rewards/rollout_reward_func/mean": -3.410893678665161, "rewards/rollout_reward_func/std": 4.572993755340576, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.062503814697266, "sampling/sampling_logp_difference/mean": 0.3045327663421631, "step": 503, "step_time": 21.327079856004275 }, { "clip_ratio/high_max": 0.005657991860061884, "clip_ratio/high_mean": 0.005657991860061884, "clip_ratio/low_mean": 0.0027192589768674225, "clip_ratio/low_min": 0.0027192589768674225, "clip_ratio/region_mean": 0.008377250691410154, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 1255.5, "completions/mean_terminated_length": 1255.5, "completions/min_length": 1164.0, "completions/min_terminated_length": 1164.0, "entropy": 0.04181271744892001, "epoch": 2.016000080640003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003939988557249308, "kl": 0.016771945054642856, "learning_rate": 7.462979998790055e-06, "loss": 0.0001, "num_tokens": 10174204.0, "reward": -3.5800859928131104, "reward_std": 15.495529174804688, "rewards/rollout_reward_func/mean": -3.5800859928131104, "rewards/rollout_reward_func/std": 15.495530128479004, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.71876525878906, "sampling/sampling_logp_difference/mean": 0.3189031779766083, "step": 504, "step_time": 21.18786385999556 }, { "clip_ratio/high_max": 0.006671620241831988, "clip_ratio/high_mean": 0.006671620241831988, "clip_ratio/low_mean": 0.003940763010177761, "clip_ratio/low_min": 0.003940763010177761, "clip_ratio/region_mean": 0.010612383193802088, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 1243.5625, "completions/mean_terminated_length": 1243.5625, "completions/min_length": 1186.0, "completions/min_terminated_length": 1186.0, "entropy": 0.04058253765106201, "epoch": 2.0200000808000032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031506051309406757, "kl": 0.0176966282306239, "learning_rate": 7.4629799987848785e-06, "loss": 0.0001, "num_tokens": 10206996.0, "reward": -7.377852439880371, "reward_std": 11.93869400024414, "rewards/rollout_reward_func/mean": -7.377852439880371, "rewards/rollout_reward_func/std": 11.93869400024414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.3125, "sampling/sampling_logp_difference/mean": 0.32197701930999756, "step": 505, "step_time": 21.100999692003825 }, { "clip_ratio/high_max": 0.0038062995590735227, "clip_ratio/high_mean": 0.0038062995590735227, "clip_ratio/low_mean": 0.0062123381067067385, "clip_ratio/low_min": 0.0062123381067067385, "clip_ratio/region_mean": 0.010018637694884092, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 1244.0, "completions/mean_terminated_length": 1244.0, "completions/min_length": 1161.0, "completions/min_terminated_length": 1161.0, "entropy": 0.04178959969431162, "epoch": 2.0240000809600033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013431021012365818, "kl": 0.02256456227041781, "learning_rate": 7.462979998779691e-06, "loss": 0.0001, "num_tokens": 10239802.0, "reward": -1.7953616380691528, "reward_std": 6.232426643371582, "rewards/rollout_reward_func/mean": -1.7953616380691528, "rewards/rollout_reward_func/std": 6.232427597045898, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.71875, "sampling/sampling_logp_difference/mean": 0.32667556405067444, "step": 506, "step_time": 21.111633226995764 }, { "clip_ratio/high_max": 0.005058318929513916, "clip_ratio/high_mean": 0.005058318929513916, "clip_ratio/low_mean": 0.0032349159300792962, "clip_ratio/low_min": 0.0032349159300792962, "clip_ratio/region_mean": 0.008293234801385552, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 1265.4375, "completions/mean_terminated_length": 1265.4375, "completions/min_length": 1156.0, "completions/min_terminated_length": 1156.0, "entropy": 0.03919566096737981, "epoch": 2.0280000811200034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008757328614592552, "kl": 0.017913484480232, "learning_rate": 7.462979998774492e-06, "loss": 0.0001, "num_tokens": 10272947.0, "reward": -6.292183876037598, "reward_std": 6.724213600158691, "rewards/rollout_reward_func/mean": -6.292183876037598, "rewards/rollout_reward_func/std": 6.724213600158691, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.28125, "sampling/sampling_logp_difference/mean": 0.32201647758483887, "step": 507, "step_time": 21.396149006002815 }, { "clip_ratio/high_max": 0.0023345597728621215, "clip_ratio/high_mean": 0.0023345597728621215, "clip_ratio/low_mean": 0.005925424688030034, "clip_ratio/low_min": 0.005925424688030034, "clip_ratio/region_mean": 0.008259984431788325, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 1242.6875, "completions/mean_terminated_length": 1242.6875, "completions/min_length": 1166.0, "completions/min_terminated_length": 1166.0, "entropy": 0.041837093885988, "epoch": 2.032000081280003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004546143114566803, "kl": 0.018579868832603097, "learning_rate": 7.462979998769283e-06, "loss": 0.0001, "num_tokens": 10305730.0, "reward": -2.7110838890075684, "reward_std": 9.271716117858887, "rewards/rollout_reward_func/mean": -2.7110838890075684, "rewards/rollout_reward_func/std": 9.271716117858887, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.875, "sampling/sampling_logp_difference/mean": 0.3217945098876953, "step": 508, "step_time": 21.07597572501254 }, { "clip_ratio/high_max": 0.005421290756203234, "clip_ratio/high_mean": 0.005421290756203234, "clip_ratio/low_mean": 0.003969948971644044, "clip_ratio/low_min": 0.003969948971644044, "clip_ratio/region_mean": 0.009391239727847278, "completions/clipped_ratio": 0.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 1239.0625, "completions/mean_terminated_length": 1239.0625, "completions/min_length": 1168.0, "completions/min_terminated_length": 1168.0, "entropy": 0.04205429879948497, "epoch": 2.0360000814400032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007813678123056889, "kl": 0.019708371139131486, "learning_rate": 7.462979998764063e-06, "loss": 0.0001, "num_tokens": 10338450.0, "reward": -4.145186424255371, "reward_std": 7.704533576965332, "rewards/rollout_reward_func/mean": -4.145186424255371, "rewards/rollout_reward_func/std": 7.704533100128174, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.09375, "sampling/sampling_logp_difference/mean": 0.3174324035644531, "step": 509, "step_time": 21.25266227400425 }, { "clip_ratio/high_max": 0.0036263592774048448, "clip_ratio/high_mean": 0.0036263592774048448, "clip_ratio/low_mean": 0.005006377585232258, "clip_ratio/low_min": 0.005006377585232258, "clip_ratio/region_mean": 0.008632736804429442, "completions/clipped_ratio": 0.0, "completions/max_length": 1358.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 1263.1875, "completions/mean_terminated_length": 1263.1875, "completions/min_length": 1169.0, "completions/min_terminated_length": 1169.0, "entropy": 0.04082600865513086, "epoch": 2.0400000816000033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010368920862674713, "kl": 0.019936417462304235, "learning_rate": 7.462979998758832e-06, "loss": 0.0001, "num_tokens": 10371565.0, "reward": -3.136538505554199, "reward_std": 8.023064613342285, "rewards/rollout_reward_func/mean": -3.136538505554199, "rewards/rollout_reward_func/std": 8.023064613342285, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.359378814697266, "sampling/sampling_logp_difference/mean": 0.30964186787605286, "step": 510, "step_time": 21.102456446998985 }, { "clip_ratio/high_max": 0.005306179053150117, "clip_ratio/high_mean": 0.005306179053150117, "clip_ratio/low_mean": 0.004429921769769862, "clip_ratio/low_min": 0.004429921769769862, "clip_ratio/region_mean": 0.009736100735608488, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 1233.5, "completions/mean_terminated_length": 1233.5, "completions/min_length": 1174.0, "completions/min_terminated_length": 1174.0, "entropy": 0.04303614376112819, "epoch": 2.0440000817600034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038824453949928284, "kl": 0.016344029922038317, "learning_rate": 7.46297999875359e-06, "loss": 0.0001, "num_tokens": 10404175.0, "reward": -1.9289782047271729, "reward_std": 8.015661239624023, "rewards/rollout_reward_func/mean": -1.9289782047271729, "rewards/rollout_reward_func/std": 8.015661239624023, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.0000114440918, "sampling/sampling_logp_difference/mean": 0.31919360160827637, "step": 511, "step_time": 21.246796339997672 }, { "clip_ratio/high_max": 0.0035351132974028587, "clip_ratio/high_mean": 0.0035351132974028587, "clip_ratio/low_mean": 0.005508789443410933, "clip_ratio/low_min": 0.005508789443410933, "clip_ratio/region_mean": 0.009043902799021453, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 1122.4375, "completions/mean_terminated_length": 1122.4375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.04473715089261532, "epoch": 2.048000081920003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004309594631195068, "kl": 0.01818866585381329, "learning_rate": 7.462979998748336e-06, "loss": 0.0001, "num_tokens": 10435012.0, "reward": 7.6955037117004395, "reward_std": 36.03250503540039, "rewards/rollout_reward_func/mean": 7.6955037117004395, "rewards/rollout_reward_func/std": 36.03250503540039, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.11723709106445, "sampling/sampling_logp_difference/mean": 0.32893019914627075, "step": 512, "step_time": 20.648080893013685 }, { "clip_ratio/high_max": 0.0041110008605755866, "clip_ratio/high_mean": 0.0041110008605755866, "clip_ratio/low_mean": 0.00459764213883318, "clip_ratio/low_min": 0.00459764213883318, "clip_ratio/region_mean": 0.008708643086720258, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 1257.5625, "completions/mean_terminated_length": 1257.5625, "completions/min_length": 1193.0, "completions/min_terminated_length": 1193.0, "entropy": 0.042006509844213724, "epoch": 2.0520000820800032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004242736380547285, "kl": 0.019074979121796787, "learning_rate": 7.462979998743071e-06, "loss": 0.0001, "num_tokens": 10468039.0, "reward": -4.514679908752441, "reward_std": 18.381126403808594, "rewards/rollout_reward_func/mean": -4.514679908752441, "rewards/rollout_reward_func/std": 18.381128311157227, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.03125, "sampling/sampling_logp_difference/mean": 0.3103182017803192, "step": 513, "step_time": 21.32672909300163 }, { "clip_ratio/high_max": 0.0042037334642373025, "clip_ratio/high_mean": 0.0042037334642373025, "clip_ratio/low_mean": 0.0045150140358600765, "clip_ratio/low_min": 0.0045150140358600765, "clip_ratio/region_mean": 0.008718747470993549, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 1247.5625, "completions/mean_terminated_length": 1247.5625, "completions/min_length": 1167.0, "completions/min_terminated_length": 1167.0, "entropy": 0.04049374861642718, "epoch": 2.0560000822400033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003452722216024995, "kl": 0.016639975714497268, "learning_rate": 7.462979998737795e-06, "loss": 0.0001, "num_tokens": 10500919.0, "reward": -2.6178953647613525, "reward_std": 6.7844133377075195, "rewards/rollout_reward_func/mean": -2.6178953647613525, "rewards/rollout_reward_func/std": 6.784414291381836, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.781253814697266, "sampling/sampling_logp_difference/mean": 0.3187319040298462, "step": 514, "step_time": 21.322253699981957 }, { "clip_ratio/high_max": 0.004360616760095581, "clip_ratio/high_mean": 0.004360616760095581, "clip_ratio/low_mean": 0.004793090367456898, "clip_ratio/low_min": 0.004793090367456898, "clip_ratio/region_mean": 0.009153707243967801, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 1250.625, "completions/mean_terminated_length": 1250.625, "completions/min_length": 1158.0, "completions/min_terminated_length": 1158.0, "entropy": 0.039672973565757275, "epoch": 2.0600000824000034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0067281476221978664, "kl": 0.01909179100766778, "learning_rate": 7.462979998732508e-06, "loss": 0.0001, "num_tokens": 10533826.0, "reward": -2.1544687747955322, "reward_std": 8.349132537841797, "rewards/rollout_reward_func/mean": -2.1544687747955322, "rewards/rollout_reward_func/std": 8.349132537841797, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.21875, "sampling/sampling_logp_difference/mean": 0.32055699825286865, "step": 515, "step_time": 21.303847967006732 }, { "clip_ratio/high_max": 0.005851482041180134, "clip_ratio/high_mean": 0.005851482041180134, "clip_ratio/low_mean": 0.0033553403918631375, "clip_ratio/low_min": 0.0033553403918631375, "clip_ratio/region_mean": 0.009206822433043271, "completions/clipped_ratio": 0.0, "completions/max_length": 1331.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 1290.375, "completions/mean_terminated_length": 1290.375, "completions/min_length": 1237.0, "completions/min_terminated_length": 1237.0, "entropy": 0.040113610215485096, "epoch": 2.0640000825600034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029269037768244743, "kl": 0.014193754526786506, "learning_rate": 7.46297999872721e-06, "loss": 0.0001, "num_tokens": 10567408.0, "reward": -4.16780424118042, "reward_std": 11.31328296661377, "rewards/rollout_reward_func/mean": -4.16780424118042, "rewards/rollout_reward_func/std": 11.31328296661377, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.0, "sampling/sampling_logp_difference/mean": 0.2984470725059509, "step": 516, "step_time": 21.32191394700203 }, { "clip_ratio/high_max": 0.005476931110024452, "clip_ratio/high_mean": 0.005476931110024452, "clip_ratio/low_mean": 0.0040703570120967925, "clip_ratio/low_min": 0.0040703570120967925, "clip_ratio/region_mean": 0.009547288122121245, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 1225.625, "completions/mean_terminated_length": 1225.625, "completions/min_length": 1157.0, "completions/min_terminated_length": 1157.0, "entropy": 0.041330878622829914, "epoch": 2.0680000827200032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010554199106991291, "kl": 0.019442893331870437, "learning_rate": 7.4629799987219025e-06, "loss": 0.0001, "num_tokens": 10599895.0, "reward": -5.994854927062988, "reward_std": 5.033411502838135, "rewards/rollout_reward_func/mean": -5.994854927062988, "rewards/rollout_reward_func/std": 5.033411502838135, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.03125, "sampling/sampling_logp_difference/mean": 0.3150887191295624, "step": 517, "step_time": 21.387828019018343 }, { "clip_ratio/high_max": 0.004089239577297121, "clip_ratio/high_mean": 0.004089239577297121, "clip_ratio/low_mean": 0.005572783556999639, "clip_ratio/low_min": 0.005572783556999639, "clip_ratio/region_mean": 0.009662023221608251, "completions/clipped_ratio": 0.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 1148.0625, "completions/mean_terminated_length": 1148.0625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.04191939067095518, "epoch": 2.0720000828800033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024079482536762953, "kl": 0.015442822477780282, "learning_rate": 7.462979998716582e-06, "loss": 0.0001, "num_tokens": 10631166.0, "reward": 6.236478328704834, "reward_std": 30.393550872802734, "rewards/rollout_reward_func/mean": 6.236478328704834, "rewards/rollout_reward_func/std": 30.393550872802734, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.5380859375, "sampling/sampling_logp_difference/mean": 0.31486621499061584, "step": 518, "step_time": 20.672429426995222 }, { "clip_ratio/high_max": 0.0046849254867993295, "clip_ratio/high_mean": 0.0046849254867993295, "clip_ratio/low_mean": 0.005444540758617222, "clip_ratio/low_min": 0.005444540758617222, "clip_ratio/region_mean": 0.01012946612900123, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 1237.0625, "completions/mean_terminated_length": 1237.0625, "completions/min_length": 1150.0, "completions/min_terminated_length": 1150.0, "entropy": 0.04305187566205859, "epoch": 2.0760000830400033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007799023296684027, "kl": 0.01760792499408126, "learning_rate": 7.462979998711251e-06, "loss": 0.0001, "num_tokens": 10663846.0, "reward": -3.4728188514709473, "reward_std": 8.990240097045898, "rewards/rollout_reward_func/mean": -3.4728188514709473, "rewards/rollout_reward_func/std": 8.990240097045898, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.56252670288086, "sampling/sampling_logp_difference/mean": 0.3212067186832428, "step": 519, "step_time": 21.28726485198422 }, { "clip_ratio/high_max": 0.006752288492862135, "clip_ratio/high_mean": 0.006752288492862135, "clip_ratio/low_mean": 0.00425958467531018, "clip_ratio/low_min": 0.00425958467531018, "clip_ratio/region_mean": 0.011011873255483806, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 1272.6875, "completions/mean_terminated_length": 1272.6875, "completions/min_length": 1164.0, "completions/min_terminated_length": 1164.0, "entropy": 0.042315938510000706, "epoch": 2.0800000832000034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004213757812976837, "kl": 0.016590268351137638, "learning_rate": 7.46297999870591e-06, "loss": 0.0001, "num_tokens": 10697136.0, "reward": -4.449126243591309, "reward_std": 13.623167037963867, "rewards/rollout_reward_func/mean": -4.449126243591309, "rewards/rollout_reward_func/std": 13.623167037963867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.328125, "sampling/sampling_logp_difference/mean": 0.32079315185546875, "step": 520, "step_time": 21.44787868000276 }, { "clip_ratio/high_max": 0.005546717962715775, "clip_ratio/high_mean": 0.005546717962715775, "clip_ratio/low_mean": 0.003741365479072556, "clip_ratio/low_min": 0.003741365479072556, "clip_ratio/region_mean": 0.009288083470892161, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 1275.8125, "completions/mean_terminated_length": 1275.8125, "completions/min_length": 1170.0, "completions/min_terminated_length": 1170.0, "entropy": 0.0382139952853322, "epoch": 2.084000083360003e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012354240752756596, "kl": 0.017858589068055153, "learning_rate": 7.462979998700557e-06, "loss": 0.0001, "num_tokens": 10730476.0, "reward": -5.299997329711914, "reward_std": 4.3402838706970215, "rewards/rollout_reward_func/mean": -5.299997329711914, "rewards/rollout_reward_func/std": 4.34028434753418, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.21875, "sampling/sampling_logp_difference/mean": 0.30847638845443726, "step": 521, "step_time": 21.2421956199978 }, { "clip_ratio/high_max": 0.003948580037103966, "clip_ratio/high_mean": 0.003948580037103966, "clip_ratio/low_mean": 0.00495024467818439, "clip_ratio/low_min": 0.00495024467818439, "clip_ratio/region_mean": 0.008898824802599847, "completions/clipped_ratio": 0.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 1209.875, "completions/mean_terminated_length": 1209.875, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "entropy": 0.043176814913749695, "epoch": 2.0880000835200032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028872855473309755, "kl": 0.014355930499732494, "learning_rate": 7.462979998695193e-06, "loss": 0.0001, "num_tokens": 10762724.0, "reward": 1.4348344802856445, "reward_std": 37.13094711303711, "rewards/rollout_reward_func/mean": 1.4348344802856445, "rewards/rollout_reward_func/std": 37.130950927734375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.1875, "sampling/sampling_logp_difference/mean": 0.33716946840286255, "step": 522, "step_time": 21.181227915993077 }, { "clip_ratio/high_max": 0.004463289165869355, "clip_ratio/high_mean": 0.004463289165869355, "clip_ratio/low_mean": 0.006052410113625228, "clip_ratio/low_min": 0.006052410113625228, "clip_ratio/region_mean": 0.010515699395909905, "completions/clipped_ratio": 0.0, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 1170.8125, "completions/mean_terminated_length": 1170.8125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.0426119570620358, "epoch": 2.0920000836800033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01751049980521202, "kl": 0.021391858346760273, "learning_rate": 7.462979998689817e-06, "loss": 0.0001, "num_tokens": 10794349.0, "reward": 0.7621612548828125, "reward_std": 25.087360382080078, "rewards/rollout_reward_func/mean": 0.7621612548828125, "rewards/rollout_reward_func/std": 25.087360382080078, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.625, "sampling/sampling_logp_difference/mean": 0.32560181617736816, "step": 523, "step_time": 20.988871069996094 }, { "clip_ratio/high_max": 0.004877602186752483, "clip_ratio/high_mean": 0.004877602186752483, "clip_ratio/low_mean": 0.004073576652444899, "clip_ratio/low_min": 0.004073576652444899, "clip_ratio/region_mean": 0.008951178984716535, "completions/clipped_ratio": 0.0, "completions/max_length": 1342.0, "completions/max_terminated_length": 1342.0, "completions/mean_length": 1271.8125, "completions/mean_terminated_length": 1271.8125, "completions/min_length": 1172.0, "completions/min_terminated_length": 1172.0, "entropy": 0.0403295480646193, "epoch": 2.0960000838400034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006488904822617769, "kl": 0.017256429768167436, "learning_rate": 7.462979998684431e-06, "loss": 0.0001, "num_tokens": 10827621.0, "reward": -4.75227689743042, "reward_std": 9.842402458190918, "rewards/rollout_reward_func/mean": -4.75227689743042, "rewards/rollout_reward_func/std": 9.842402458190918, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.65625, "sampling/sampling_logp_difference/mean": 0.32640793919563293, "step": 524, "step_time": 21.446400772991183 }, { "clip_ratio/high_max": 0.004633582720998675, "clip_ratio/high_mean": 0.004633582720998675, "clip_ratio/low_mean": 0.003726722818100825, "clip_ratio/low_min": 0.003726722818100825, "clip_ratio/region_mean": 0.00836030556820333, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 1298.25, "completions/mean_terminated_length": 1298.25, "completions/min_length": 1241.0, "completions/min_terminated_length": 1241.0, "entropy": 0.03787679085507989, "epoch": 2.1000000840000035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012843688949942589, "kl": 0.018334439140744507, "learning_rate": 7.462979998679033e-06, "loss": 0.0001, "num_tokens": 10861337.0, "reward": -5.279171943664551, "reward_std": 8.624122619628906, "rewards/rollout_reward_func/mean": -5.279171943664551, "rewards/rollout_reward_func/std": 8.624122619628906, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.46875, "sampling/sampling_logp_difference/mean": 0.3108331859111786, "step": 525, "step_time": 21.546395712000958 }, { "clip_ratio/high_max": 0.003544144274201244, "clip_ratio/high_mean": 0.003544144274201244, "clip_ratio/low_mean": 0.005439832166302949, "clip_ratio/low_min": 0.005439832166302949, "clip_ratio/region_mean": 0.008983976498711854, "completions/clipped_ratio": 0.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 1217.5, "completions/mean_terminated_length": 1217.5, "completions/min_length": 1171.0, "completions/min_terminated_length": 1171.0, "entropy": 0.043404740281403065, "epoch": 2.1040000841600032e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022695250809192657, "kl": 0.014587631332688034, "learning_rate": 7.462979998673626e-06, "loss": 0.0001, "num_tokens": 10893694.0, "reward": -3.4616565704345703, "reward_std": 9.254555702209473, "rewards/rollout_reward_func/mean": -3.4616565704345703, "rewards/rollout_reward_func/std": 9.254555702209473, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.0, "sampling/sampling_logp_difference/mean": 0.3213300406932831, "step": 526, "step_time": 21.18363817399222 }, { "clip_ratio/high_max": 0.005043802957516164, "clip_ratio/high_mean": 0.005043802957516164, "clip_ratio/low_mean": 0.004683645471232012, "clip_ratio/low_min": 0.004683645471232012, "clip_ratio/region_mean": 0.009727448341436684, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 1155.0, "completions/mean_terminated_length": 1155.0, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.040993707254529, "epoch": 2.1080000843200033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010527383536100388, "kl": 0.020169267896562815, "learning_rate": 7.462979998668206e-06, "loss": 0.0001, "num_tokens": 10925040.0, "reward": -0.5707104206085205, "reward_std": 24.296987533569336, "rewards/rollout_reward_func/mean": -0.5707104206085205, "rewards/rollout_reward_func/std": 24.296987533569336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.4375, "sampling/sampling_logp_difference/mean": 0.3252147138118744, "step": 527, "step_time": 20.57996595599252 }, { "clip_ratio/high_max": 0.0028158065979368985, "clip_ratio/high_mean": 0.0028158065979368985, "clip_ratio/low_mean": 0.006215094588696957, "clip_ratio/low_min": 0.006215094588696957, "clip_ratio/region_mean": 0.009030901244841516, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 1294.5, "completions/mean_terminated_length": 1294.5, "completions/min_length": 1224.0, "completions/min_terminated_length": 1224.0, "entropy": 0.04038167838007212, "epoch": 2.1120000844800034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003636921290308237, "kl": 0.013139684800989926, "learning_rate": 7.462979998662775e-06, "loss": 0.0001, "num_tokens": 10958696.0, "reward": -0.7991265654563904, "reward_std": 11.434734344482422, "rewards/rollout_reward_func/mean": -0.7991265654563904, "rewards/rollout_reward_func/std": 11.434734344482422, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.687530517578125, "sampling/sampling_logp_difference/mean": 0.3290092349052429, "step": 528, "step_time": 21.372452372990665 }, { "clip_ratio/high_max": 0.004750133666675538, "clip_ratio/high_mean": 0.004750133666675538, "clip_ratio/low_mean": 0.004896203055977821, "clip_ratio/low_min": 0.004896203055977821, "clip_ratio/region_mean": 0.00964633672265336, "completions/clipped_ratio": 0.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 1242.625, "completions/mean_terminated_length": 1242.625, "completions/min_length": 1184.0, "completions/min_terminated_length": 1184.0, "entropy": 0.04091883171349764, "epoch": 2.1160000846400035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038145517464727163, "kl": 0.014156820659991354, "learning_rate": 7.462979998657335e-06, "loss": 0.0001, "num_tokens": 10991477.0, "reward": -3.9555487632751465, "reward_std": 8.525548934936523, "rewards/rollout_reward_func/mean": -3.9555487632751465, "rewards/rollout_reward_func/std": 8.52554988861084, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.937503814697266, "sampling/sampling_logp_difference/mean": 0.3108269274234772, "step": 529, "step_time": 21.22897573699447 }, { "clip_ratio/high_max": 0.004928094887873158, "clip_ratio/high_mean": 0.004928094887873158, "clip_ratio/low_mean": 0.005101059941807762, "clip_ratio/low_min": 0.005101059941807762, "clip_ratio/region_mean": 0.010029154771473259, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 1157.3125, "completions/mean_terminated_length": 1157.3125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.042969174683094025, "epoch": 2.1200000848000036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010390128009021282, "kl": 0.019424797501415014, "learning_rate": 7.4629799986518825e-06, "loss": 0.0001, "num_tokens": 11022865.0, "reward": 1.440397024154663, "reward_std": 24.9404296875, "rewards/rollout_reward_func/mean": 1.440397024154663, "rewards/rollout_reward_func/std": 24.9404296875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.875, "sampling/sampling_logp_difference/mean": 0.3248041570186615, "step": 530, "step_time": 20.32911943500949 }, { "clip_ratio/high_max": 0.005143544520251453, "clip_ratio/high_mean": 0.005143544520251453, "clip_ratio/low_mean": 0.005464960588142276, "clip_ratio/low_min": 0.005464960588142276, "clip_ratio/region_mean": 0.01060850522480905, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 1272.1875, "completions/mean_terminated_length": 1272.1875, "completions/min_length": 1179.0, "completions/min_terminated_length": 1179.0, "entropy": 0.03985893772915006, "epoch": 2.1240000849600033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002462019445374608, "kl": 0.01342407357878983, "learning_rate": 7.462979998646419e-06, "loss": 0.0001, "num_tokens": 11056133.0, "reward": -3.9312970638275146, "reward_std": 4.884079456329346, "rewards/rollout_reward_func/mean": -3.9312970638275146, "rewards/rollout_reward_func/std": 4.884079456329346, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.875022888183594, "sampling/sampling_logp_difference/mean": 0.3147628605365753, "step": 531, "step_time": 21.438091536016145 }, { "clip_ratio/high_max": 0.00286322069587186, "clip_ratio/high_mean": 0.00286322069587186, "clip_ratio/low_mean": 0.0063129707996267825, "clip_ratio/low_min": 0.0063129707996267825, "clip_ratio/region_mean": 0.009176191466394812, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 1213.0625, "completions/mean_terminated_length": 1213.0625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.041449159383773804, "epoch": 2.1280000851200034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005756467115134001, "kl": 0.01595125289168209, "learning_rate": 7.462979998640944e-06, "loss": 0.0001, "num_tokens": 11088444.0, "reward": -1.3660888671875, "reward_std": 28.06230354309082, "rewards/rollout_reward_func/mean": -1.3660888671875, "rewards/rollout_reward_func/std": 28.06230354309082, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.515625, "sampling/sampling_logp_difference/mean": 0.3260144889354706, "step": 532, "step_time": 21.05014824398677 }, { "clip_ratio/high_max": 0.005439427739474922, "clip_ratio/high_mean": 0.005439427739474922, "clip_ratio/low_mean": 0.0045986754121258855, "clip_ratio/low_min": 0.0045986754121258855, "clip_ratio/region_mean": 0.01003810332622379, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 1207.8125, "completions/mean_terminated_length": 1207.8125, "completions/min_length": 1165.0, "completions/min_terminated_length": 1165.0, "entropy": 0.04316429328173399, "epoch": 2.1320000852800035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029022665694355965, "kl": 0.015339846373535693, "learning_rate": 7.462979998635458e-06, "loss": 0.0001, "num_tokens": 11120626.0, "reward": -5.9291181564331055, "reward_std": 7.317422389984131, "rewards/rollout_reward_func/mean": -5.9291181564331055, "rewards/rollout_reward_func/std": 7.317423343658447, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.828125, "sampling/sampling_logp_difference/mean": 0.329398512840271, "step": 533, "step_time": 20.690435006006737 }, { "clip_ratio/high_max": 0.003669997735414654, "clip_ratio/high_mean": 0.003669997735414654, "clip_ratio/low_mean": 0.004721552482806146, "clip_ratio/low_min": 0.004721552482806146, "clip_ratio/region_mean": 0.008391550101805478, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 1245.875, "completions/mean_terminated_length": 1245.875, "completions/min_length": 1186.0, "completions/min_terminated_length": 1186.0, "entropy": 0.040844430681318045, "epoch": 2.1360000854400035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.026124171912670135, "kl": 0.01926121744327247, "learning_rate": 7.462979998629962e-06, "loss": 0.0001, "num_tokens": 11153460.0, "reward": -1.8226407766342163, "reward_std": 8.888376235961914, "rewards/rollout_reward_func/mean": -1.8226407766342163, "rewards/rollout_reward_func/std": 8.888376235961914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.4375, "sampling/sampling_logp_difference/mean": 0.31279706954956055, "step": 534, "step_time": 20.67473157501081 }, { "clip_ratio/high_max": 0.00482732240925543, "clip_ratio/high_mean": 0.00482732240925543, "clip_ratio/low_mean": 0.005106301890918985, "clip_ratio/low_min": 0.005106301890918985, "clip_ratio/region_mean": 0.009933624358382076, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 1254.9375, "completions/mean_terminated_length": 1254.9375, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "entropy": 0.040313185192644596, "epoch": 2.1400000856000033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004335680976510048, "kl": 0.018335316563025117, "learning_rate": 7.462979998624454e-06, "loss": 0.0001, "num_tokens": 11186435.0, "reward": -4.6623735427856445, "reward_std": 10.736213684082031, "rewards/rollout_reward_func/mean": -4.6623735427856445, "rewards/rollout_reward_func/std": 10.736213684082031, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.71875, "sampling/sampling_logp_difference/mean": 0.3229050040245056, "step": 535, "step_time": 21.269707719002326 }, { "clip_ratio/high_max": 0.004513251828029752, "clip_ratio/high_mean": 0.004513251828029752, "clip_ratio/low_mean": 0.0060381023795343935, "clip_ratio/low_min": 0.0060381023795343935, "clip_ratio/region_mean": 0.010551354265771806, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 1286.1875, "completions/mean_terminated_length": 1286.1875, "completions/min_length": 1234.0, "completions/min_terminated_length": 1234.0, "entropy": 0.04007962252944708, "epoch": 2.1440000857600034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003746128873899579, "kl": 0.01709703798405826, "learning_rate": 7.4629799986189355e-06, "loss": 0.0001, "num_tokens": 11219959.0, "reward": -3.961876153945923, "reward_std": 11.256902694702148, "rewards/rollout_reward_func/mean": -3.961876153945923, "rewards/rollout_reward_func/std": 11.256902694702148, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.3125, "sampling/sampling_logp_difference/mean": 0.32097917795181274, "step": 536, "step_time": 21.270771221003088 }, { "clip_ratio/high_max": 0.004767630365677178, "clip_ratio/high_mean": 0.004767630365677178, "clip_ratio/low_mean": 0.004943895852193236, "clip_ratio/low_min": 0.004943895852193236, "clip_ratio/region_mean": 0.009711526334285736, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 1199.8125, "completions/mean_terminated_length": 1199.8125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.039681649301201105, "epoch": 2.1480000859200034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034976413007825613, "kl": 0.013171162223443389, "learning_rate": 7.462979998613405e-06, "loss": 0.0001, "num_tokens": 11252064.0, "reward": 2.111711025238037, "reward_std": 21.388105392456055, "rewards/rollout_reward_func/mean": 2.111711025238037, "rewards/rollout_reward_func/std": 21.388107299804688, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.1875, "sampling/sampling_logp_difference/mean": 0.3139588236808777, "step": 537, "step_time": 21.06099017001543 }, { "clip_ratio/high_max": 0.004949772410327569, "clip_ratio/high_mean": 0.004949772410327569, "clip_ratio/low_mean": 0.004906963382381946, "clip_ratio/low_min": 0.004906963382381946, "clip_ratio/region_mean": 0.009856735763605684, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 1213.5625, "completions/mean_terminated_length": 1213.5625, "completions/min_length": 1151.0, "completions/min_terminated_length": 1151.0, "entropy": 0.04052913747727871, "epoch": 2.1520000860800035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006916518788784742, "kl": 0.021487224265001714, "learning_rate": 7.462979998607865e-06, "loss": 0.0001, "num_tokens": 11284327.0, "reward": -6.2899627685546875, "reward_std": 11.048513412475586, "rewards/rollout_reward_func/mean": -6.2899627685546875, "rewards/rollout_reward_func/std": 11.048514366149902, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.78125, "sampling/sampling_logp_difference/mean": 0.3430687189102173, "step": 538, "step_time": 20.747460165999655 }, { "clip_ratio/high_max": 0.00472208287101239, "clip_ratio/high_mean": 0.00472208287101239, "clip_ratio/low_mean": 0.005066392506705597, "clip_ratio/low_min": 0.005066392506705597, "clip_ratio/region_mean": 0.009788475348614156, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 1274.5, "completions/mean_terminated_length": 1274.5, "completions/min_length": 1188.0, "completions/min_terminated_length": 1188.0, "entropy": 0.03897418174892664, "epoch": 2.1560000862400036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0046687135472893715, "kl": 0.015414949506521225, "learning_rate": 7.462979998602313e-06, "loss": 0.0001, "num_tokens": 11317643.0, "reward": -3.709850788116455, "reward_std": 9.810515403747559, "rewards/rollout_reward_func/mean": -3.709850788116455, "rewards/rollout_reward_func/std": 9.810515403747559, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.5625, "sampling/sampling_logp_difference/mean": 0.3163127899169922, "step": 539, "step_time": 21.524496574005752 }, { "clip_ratio/high_max": 0.004073833872098476, "clip_ratio/high_mean": 0.004073833872098476, "clip_ratio/low_mean": 0.005087183613795787, "clip_ratio/low_min": 0.005087183613795787, "clip_ratio/region_mean": 0.009161017544101924, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 1256.25, "completions/mean_terminated_length": 1256.25, "completions/min_length": 1050.0, "completions/min_terminated_length": 1050.0, "entropy": 0.04125458840280771, "epoch": 2.1600000864000033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029650512151420116, "kl": 0.011094387853518128, "learning_rate": 7.46297999859675e-06, "loss": 0.0001, "num_tokens": 11350685.0, "reward": 3.0147061347961426, "reward_std": 15.599322319030762, "rewards/rollout_reward_func/mean": 3.0147061347961426, "rewards/rollout_reward_func/std": 15.599323272705078, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.806640625, "sampling/sampling_logp_difference/mean": 0.32594916224479675, "step": 540, "step_time": 21.112935155011655 }, { "clip_ratio/high_max": 0.0057026942376978695, "clip_ratio/high_mean": 0.0057026942376978695, "clip_ratio/low_mean": 0.0029166348977014422, "clip_ratio/low_min": 0.0029166348977014422, "clip_ratio/region_mean": 0.008619329135399312, "completions/clipped_ratio": 0.0, "completions/max_length": 1325.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 1223.875, "completions/mean_terminated_length": 1223.875, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "entropy": 0.042146185878664255, "epoch": 2.1640000865600034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0041093952022492886, "kl": 0.01598903222475201, "learning_rate": 7.462979998591176e-06, "loss": 0.0001, "num_tokens": 11383150.0, "reward": -3.665891170501709, "reward_std": 11.710331916809082, "rewards/rollout_reward_func/mean": -3.665891170501709, "rewards/rollout_reward_func/std": 11.710333824157715, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.500003814697266, "sampling/sampling_logp_difference/mean": 0.3143664002418518, "step": 541, "step_time": 21.451904108005692 }, { "clip_ratio/high_max": 0.0045437093649525195, "clip_ratio/high_mean": 0.0045437093649525195, "clip_ratio/low_mean": 0.0052967767405789346, "clip_ratio/low_min": 0.0052967767405789346, "clip_ratio/region_mean": 0.009840485989116132, "completions/clipped_ratio": 0.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 1250.6875, "completions/mean_terminated_length": 1250.6875, "completions/min_length": 1169.0, "completions/min_terminated_length": 1169.0, "entropy": 0.0391747378744185, "epoch": 2.1680000867200035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002759922994300723, "kl": 0.01104249432682991, "learning_rate": 7.462979998585591e-06, "loss": 0.0001, "num_tokens": 11416066.0, "reward": -4.167377471923828, "reward_std": 11.786316871643066, "rewards/rollout_reward_func/mean": -4.167377471923828, "rewards/rollout_reward_func/std": 11.786316871643066, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.671875, "sampling/sampling_logp_difference/mean": 0.3261106312274933, "step": 542, "step_time": 21.24479607601097 }, { "clip_ratio/high_max": 0.0038495755870826542, "clip_ratio/high_mean": 0.0038495755870826542, "clip_ratio/low_mean": 0.00491845965734683, "clip_ratio/low_min": 0.00491845965734683, "clip_ratio/region_mean": 0.008768035215325654, "completions/clipped_ratio": 0.0, "completions/max_length": 1357.0, "completions/max_terminated_length": 1357.0, "completions/mean_length": 1305.4375, "completions/mean_terminated_length": 1305.4375, "completions/min_length": 1236.0, "completions/min_terminated_length": 1236.0, "entropy": 0.040195011999458075, "epoch": 2.1720000868800036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002607793314382434, "kl": 0.01265962969046086, "learning_rate": 7.462979998579996e-06, "loss": 0.0001, "num_tokens": 11449907.0, "reward": -3.1596643924713135, "reward_std": 12.32529354095459, "rewards/rollout_reward_func/mean": -3.1596643924713135, "rewards/rollout_reward_func/std": 12.325295448303223, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.562503814697266, "sampling/sampling_logp_difference/mean": 0.3097037672996521, "step": 543, "step_time": 21.26217056599853 }, { "clip_ratio/high_max": 0.0052125577931292355, "clip_ratio/high_mean": 0.0052125577931292355, "clip_ratio/low_mean": 0.004725999344373122, "clip_ratio/low_min": 0.004725999344373122, "clip_ratio/region_mean": 0.009938557108398527, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 1160.0625, "completions/mean_terminated_length": 1160.0625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.04120488930493593, "epoch": 2.1760000870400033e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028341037686914206, "kl": 0.015939991688355803, "learning_rate": 7.4629799985743875e-06, "loss": 0.0001, "num_tokens": 11481332.0, "reward": 1.5613999366760254, "reward_std": 24.931570053100586, "rewards/rollout_reward_func/mean": 1.5613999366760254, "rewards/rollout_reward_func/std": 24.93157196044922, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.28125, "sampling/sampling_logp_difference/mean": 0.3381688594818115, "step": 544, "step_time": 20.56163864701375 }, { "clip_ratio/high_max": 0.0045934864901937544, "clip_ratio/high_mean": 0.0045934864901937544, "clip_ratio/low_mean": 0.003847694839350879, "clip_ratio/low_min": 0.003847694839350879, "clip_ratio/region_mean": 0.008441181329544634, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 1261.0625, "completions/mean_terminated_length": 1261.0625, "completions/min_length": 1171.0, "completions/min_terminated_length": 1171.0, "entropy": 0.04245683131739497, "epoch": 2.1800000872000034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034458909649401903, "kl": 0.01563636097125709, "learning_rate": 7.46297999856877e-06, "loss": 0.0001, "num_tokens": 11514421.0, "reward": -3.7235968112945557, "reward_std": 5.326591968536377, "rewards/rollout_reward_func/mean": -3.7235968112945557, "rewards/rollout_reward_func/std": 5.326592445373535, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.703125, "sampling/sampling_logp_difference/mean": 0.316827654838562, "step": 545, "step_time": 21.399655863999214 }, { "clip_ratio/high_max": 0.004659040612750687, "clip_ratio/high_mean": 0.004659040612750687, "clip_ratio/low_mean": 0.004743735014926642, "clip_ratio/low_min": 0.004743735014926642, "clip_ratio/region_mean": 0.009402775554917753, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 1301.5625, "completions/mean_terminated_length": 1301.5625, "completions/min_length": 1215.0, "completions/min_terminated_length": 1215.0, "entropy": 0.0402155127376318, "epoch": 2.1840000873600035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.04210992529988289, "kl": 0.023153396206907928, "learning_rate": 7.4629799985631406e-06, "loss": 0.0001, "num_tokens": 11548189.0, "reward": -5.488546848297119, "reward_std": 10.968596458435059, "rewards/rollout_reward_func/mean": -5.488546848297119, "rewards/rollout_reward_func/std": 10.968596458435059, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.375, "sampling/sampling_logp_difference/mean": 0.308463990688324, "step": 546, "step_time": 21.029117830003088 }, { "clip_ratio/high_max": 0.0033645840303506702, "clip_ratio/high_mean": 0.0033645840303506702, "clip_ratio/low_mean": 0.006391011760570109, "clip_ratio/low_min": 0.006391011760570109, "clip_ratio/region_mean": 0.009755595703609288, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 1225.1875, "completions/mean_terminated_length": 1225.1875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.04000043077394366, "epoch": 2.1880000875200036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008726043626666069, "kl": 0.01735418231692165, "learning_rate": 7.4629799985575e-06, "loss": 0.0001, "num_tokens": 11580732.0, "reward": 2.420804500579834, "reward_std": 24.27250862121582, "rewards/rollout_reward_func/mean": 2.420804500579834, "rewards/rollout_reward_func/std": 24.27250862121582, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.4296875, "sampling/sampling_logp_difference/mean": 0.3194635808467865, "step": 547, "step_time": 21.034066331005306 }, { "clip_ratio/high_max": 0.004676664422731847, "clip_ratio/high_mean": 0.004676664422731847, "clip_ratio/low_mean": 0.004831183759961277, "clip_ratio/low_min": 0.004831183759961277, "clip_ratio/region_mean": 0.009507848240900785, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 1288.5625, "completions/mean_terminated_length": 1288.5625, "completions/min_length": 1190.0, "completions/min_terminated_length": 1190.0, "entropy": 0.038511448074132204, "epoch": 2.1920000876800037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0613638199865818, "kl": 0.028152960643637925, "learning_rate": 7.462979998551849e-06, "loss": 0.0002, "num_tokens": 11614281.0, "reward": -8.000301361083984, "reward_std": 12.404560089111328, "rewards/rollout_reward_func/mean": -8.000301361083984, "rewards/rollout_reward_func/std": 12.404560089111328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.51606369018555, "sampling/sampling_logp_difference/mean": 0.3071022033691406, "step": 548, "step_time": 21.415444082012982 }, { "clip_ratio/high_max": 0.005651243031024933, "clip_ratio/high_mean": 0.005651243031024933, "clip_ratio/low_mean": 0.0041297129355371, "clip_ratio/low_min": 0.0041297129355371, "clip_ratio/region_mean": 0.009780955966562033, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 1292.5625, "completions/mean_terminated_length": 1292.5625, "completions/min_length": 1194.0, "completions/min_terminated_length": 1194.0, "entropy": 0.03946202341467142, "epoch": 2.1960000878400034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002642995910719037, "kl": 0.013279268168844283, "learning_rate": 7.462979998546187e-06, "loss": 0.0001, "num_tokens": 11647913.0, "reward": -7.2775163650512695, "reward_std": 8.515360832214355, "rewards/rollout_reward_func/mean": -7.2775163650512695, "rewards/rollout_reward_func/std": 8.515361785888672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.875, "sampling/sampling_logp_difference/mean": 0.31023621559143066, "step": 549, "step_time": 21.18509838001046 }, { "clip_ratio/high_max": 0.004446469625690952, "clip_ratio/high_mean": 0.004446469625690952, "clip_ratio/low_mean": 0.005374069005483761, "clip_ratio/low_min": 0.005374069005483761, "clip_ratio/region_mean": 0.009820538689382374, "completions/clipped_ratio": 0.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 1233.3125, "completions/mean_terminated_length": 1233.3125, "completions/min_length": 1160.0, "completions/min_terminated_length": 1160.0, "entropy": 0.04123372258618474, "epoch": 2.2000000880000035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005274415947496891, "kl": 0.016586533864028752, "learning_rate": 7.462979998540513e-06, "loss": 0.0001, "num_tokens": 11680534.0, "reward": 3.692507266998291, "reward_std": 10.53227710723877, "rewards/rollout_reward_func/mean": 3.692507266998291, "rewards/rollout_reward_func/std": 10.53227710723877, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.814453125, "sampling/sampling_logp_difference/mean": 0.3228020966053009, "step": 550, "step_time": 21.248482679991866 }, { "clip_ratio/high_max": 0.004215940600261092, "clip_ratio/high_mean": 0.004215940600261092, "clip_ratio/low_mean": 0.0056504081876482815, "clip_ratio/low_min": 0.0056504081876482815, "clip_ratio/region_mean": 0.009866348817013204, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 1223.0625, "completions/mean_terminated_length": 1223.0625, "completions/min_length": 1129.0, "completions/min_terminated_length": 1129.0, "entropy": 0.04146201629191637, "epoch": 2.2040000881600036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037865550257265568, "kl": 0.013673859531991184, "learning_rate": 7.4629799985348285e-06, "loss": 0.0001, "num_tokens": 11712987.0, "reward": -2.8815197944641113, "reward_std": 6.187892436981201, "rewards/rollout_reward_func/mean": -2.8815197944641113, "rewards/rollout_reward_func/std": 6.187891960144043, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.375, "sampling/sampling_logp_difference/mean": 0.34493348002433777, "step": 551, "step_time": 21.258278507004434 }, { "clip_ratio/high_max": 0.0043864815452252515, "clip_ratio/high_mean": 0.0043864815452252515, "clip_ratio/low_mean": 0.00407654361333698, "clip_ratio/low_min": 0.00407654361333698, "clip_ratio/region_mean": 0.008463025209493935, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 1212.75, "completions/mean_terminated_length": 1212.75, "completions/min_length": 1183.0, "completions/min_terminated_length": 1183.0, "entropy": 0.04038800299167633, "epoch": 2.2080000883200036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.015731358900666237, "kl": 0.020113181322813034, "learning_rate": 7.462979998529133e-06, "loss": 0.0001, "num_tokens": 11745266.0, "reward": -4.331628799438477, "reward_std": 2.6403768062591553, "rewards/rollout_reward_func/mean": -4.331628799438477, "rewards/rollout_reward_func/std": 2.6403768062591553, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.53125, "sampling/sampling_logp_difference/mean": 0.3285443186759949, "step": 552, "step_time": 20.748882127998513 }, { "clip_ratio/high_max": 0.002920878498116508, "clip_ratio/high_mean": 0.002920878498116508, "clip_ratio/low_mean": 0.007842441031243652, "clip_ratio/low_min": 0.007842441031243652, "clip_ratio/region_mean": 0.01076331955846399, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 1082.25, "completions/mean_terminated_length": 1082.25, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.044889742974191904, "epoch": 2.2120000884800034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.034741029143333435, "kl": 0.025048977346159518, "learning_rate": 7.462979998523427e-06, "loss": 0.0001, "num_tokens": 11775436.0, "reward": 6.329862117767334, "reward_std": 32.893882751464844, "rewards/rollout_reward_func/mean": 6.329862117767334, "rewards/rollout_reward_func/std": 32.893882751464844, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.6502799987793, "sampling/sampling_logp_difference/mean": 0.3415294289588928, "step": 553, "step_time": 23.705802600990864 }, { "clip_ratio/high_max": 0.0023737573937978595, "clip_ratio/high_mean": 0.0023737573937978595, "clip_ratio/low_mean": 0.007441435591317713, "clip_ratio/low_min": 0.007441435591317713, "clip_ratio/region_mean": 0.00981519283959642, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 1081.5, "completions/mean_terminated_length": 1081.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.04245189344510436, "epoch": 2.2160000886400035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012806428596377373, "kl": 0.017295413883402944, "learning_rate": 7.462979998517709e-06, "loss": 0.0001, "num_tokens": 11805597.0, "reward": 9.479105949401855, "reward_std": 33.62480163574219, "rewards/rollout_reward_func/mean": 9.479105949401855, "rewards/rollout_reward_func/std": 33.62480163574219, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.53125, "sampling/sampling_logp_difference/mean": 0.3395232856273651, "step": 554, "step_time": 19.449593583987735 }, { "clip_ratio/high_max": 0.004155957722105086, "clip_ratio/high_mean": 0.004155957722105086, "clip_ratio/low_mean": 0.005306767066940665, "clip_ratio/low_min": 0.005306767066940665, "clip_ratio/region_mean": 0.009462724789045751, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 1195.125, "completions/mean_terminated_length": 1195.125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.03955852473154664, "epoch": 2.2200000888000035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029775446746498346, "kl": 0.015520191751420498, "learning_rate": 7.46297999851198e-06, "loss": 0.0001, "num_tokens": 11837615.0, "reward": 0.18071269989013672, "reward_std": 27.610763549804688, "rewards/rollout_reward_func/mean": 0.18071269989013672, "rewards/rollout_reward_func/std": 27.610767364501953, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.171875, "sampling/sampling_logp_difference/mean": 0.31902676820755005, "step": 555, "step_time": 20.756461879995186 }, { "clip_ratio/high_max": 0.0036107131163589656, "clip_ratio/high_mean": 0.0036107131163589656, "clip_ratio/low_mean": 0.005873871559742838, "clip_ratio/low_min": 0.005873871559742838, "clip_ratio/region_mean": 0.009484584676101804, "completions/clipped_ratio": 0.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 1152.125, "completions/mean_terminated_length": 1152.125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.04344509495422244, "epoch": 2.2240000889600036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003536357544362545, "kl": 0.015957089024595916, "learning_rate": 7.46297999850624e-06, "loss": 0.0001, "num_tokens": 11868911.0, "reward": 4.81016731262207, "reward_std": 19.480083465576172, "rewards/rollout_reward_func/mean": 4.81016731262207, "rewards/rollout_reward_func/std": 19.480083465576172, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.140625, "sampling/sampling_logp_difference/mean": 0.32198718190193176, "step": 556, "step_time": 20.679046593999374 }, { "clip_ratio/high_max": 0.004161089455010369, "clip_ratio/high_mean": 0.004161089455010369, "clip_ratio/low_mean": 0.004447369748959318, "clip_ratio/low_min": 0.004447369748959318, "clip_ratio/region_mean": 0.008608459203969687, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 1241.9375, "completions/mean_terminated_length": 1241.9375, "completions/min_length": 1181.0, "completions/min_terminated_length": 1181.0, "entropy": 0.03982200240716338, "epoch": 2.2280000891200037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004193360451608896, "kl": 0.017739201430231333, "learning_rate": 7.462979998500491e-06, "loss": 0.0001, "num_tokens": 11901675.0, "reward": -8.389251708984375, "reward_std": 10.117805480957031, "rewards/rollout_reward_func/mean": -8.389251708984375, "rewards/rollout_reward_func/std": 10.117805480957031, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.40625, "sampling/sampling_logp_difference/mean": 0.3252720236778259, "step": 557, "step_time": 21.160291447995405 }, { "clip_ratio/high_max": 0.0048086108872666955, "clip_ratio/high_mean": 0.0048086108872666955, "clip_ratio/low_mean": 0.003924489778000861, "clip_ratio/low_min": 0.003924489778000861, "clip_ratio/region_mean": 0.008733100432436913, "completions/clipped_ratio": 0.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 1231.9375, "completions/mean_terminated_length": 1231.9375, "completions/min_length": 1170.0, "completions/min_terminated_length": 1170.0, "entropy": 0.039726578164845705, "epoch": 2.2320000892800035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00480961799621582, "kl": 0.015902490355074406, "learning_rate": 7.462979998494728e-06, "loss": 0.0001, "num_tokens": 11934283.0, "reward": -3.350034475326538, "reward_std": 7.007432460784912, "rewards/rollout_reward_func/mean": -3.350034475326538, "rewards/rollout_reward_func/std": 7.007432460784912, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.68752670288086, "sampling/sampling_logp_difference/mean": 0.312104195356369, "step": 558, "step_time": 21.220495095985825 }, { "clip_ratio/high_max": 0.0051538877305574715, "clip_ratio/high_mean": 0.0051538877305574715, "clip_ratio/low_mean": 0.004037840582896024, "clip_ratio/low_min": 0.004037840582896024, "clip_ratio/region_mean": 0.009191728371661156, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 1249.5625, "completions/mean_terminated_length": 1249.5625, "completions/min_length": 1171.0, "completions/min_terminated_length": 1171.0, "entropy": 0.0408984269015491, "epoch": 2.2360000894400035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.05207343026995659, "kl": 0.03191479039378464, "learning_rate": 7.462979998488955e-06, "loss": 0.0002, "num_tokens": 11967178.0, "reward": 0.49674713611602783, "reward_std": 10.828417778015137, "rewards/rollout_reward_func/mean": 0.49674713611602783, "rewards/rollout_reward_func/std": 10.828417778015137, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.031253814697266, "sampling/sampling_logp_difference/mean": 0.32006561756134033, "step": 559, "step_time": 21.175748973000736 }, { "clip_ratio/high_max": 0.004766366444528103, "clip_ratio/high_mean": 0.004766366444528103, "clip_ratio/low_mean": 0.004053625889355317, "clip_ratio/low_min": 0.004053625889355317, "clip_ratio/region_mean": 0.008819992246571928, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 1248.875, "completions/mean_terminated_length": 1248.875, "completions/min_length": 1174.0, "completions/min_terminated_length": 1174.0, "entropy": 0.04084308724850416, "epoch": 2.2400000896000036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004895514342933893, "kl": 0.017894530319608748, "learning_rate": 7.4629799984831705e-06, "loss": 0.0001, "num_tokens": 12000081.0, "reward": -5.203489303588867, "reward_std": 7.697603225708008, "rewards/rollout_reward_func/mean": -5.203489303588867, "rewards/rollout_reward_func/std": 7.697603702545166, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.031253814697266, "sampling/sampling_logp_difference/mean": 0.3049893081188202, "step": 560, "step_time": 20.853445883993118 }, { "clip_ratio/high_max": 0.0043573105649556965, "clip_ratio/high_mean": 0.0043573105649556965, "clip_ratio/low_mean": 0.005335238325642422, "clip_ratio/low_min": 0.005335238325642422, "clip_ratio/region_mean": 0.009692548832390457, "completions/clipped_ratio": 0.0, "completions/max_length": 1479.0, "completions/max_terminated_length": 1479.0, "completions/mean_length": 1452.9375, "completions/mean_terminated_length": 1452.9375, "completions/min_length": 1387.0, "completions/min_terminated_length": 1387.0, "entropy": 0.037755177821964025, "epoch": 2.2440000897600037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004873010329902172, "kl": 0.019569002324715257, "learning_rate": 7.462979998477376e-06, "loss": 0.0001, "num_tokens": 12036247.0, "reward": -2.0516748428344727, "reward_std": 11.170336723327637, "rewards/rollout_reward_func/mean": -2.0516748428344727, "rewards/rollout_reward_func/std": 11.170336723327637, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.75, "sampling/sampling_logp_difference/mean": 0.3051241934299469, "step": 561, "step_time": 22.8370193300143 }, { "clip_ratio/high_max": 0.005089975020382553, "clip_ratio/high_mean": 0.005089975020382553, "clip_ratio/low_mean": 0.0038314241683110595, "clip_ratio/low_min": 0.0038314241683110595, "clip_ratio/region_mean": 0.008921399246901274, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 1485.25, "completions/mean_terminated_length": 1485.25, "completions/min_length": 1426.0, "completions/min_terminated_length": 1426.0, "entropy": 0.039651522878557444, "epoch": 2.2480000899200034e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003503743326291442, "kl": 0.018042357987724245, "learning_rate": 7.46297999847157e-06, "loss": 0.0001, "num_tokens": 12072961.0, "reward": 0.2735464572906494, "reward_std": 13.448026657104492, "rewards/rollout_reward_func/mean": 0.2735464572906494, "rewards/rollout_reward_func/std": 13.448027610778809, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.59375, "sampling/sampling_logp_difference/mean": 0.29573869705200195, "step": 562, "step_time": 23.4126522990191 }, { "clip_ratio/high_max": 0.0029696481651626527, "clip_ratio/high_mean": 0.0029696481651626527, "clip_ratio/low_mean": 0.005165445269085467, "clip_ratio/low_min": 0.005165445269085467, "clip_ratio/region_mean": 0.008135093376040459, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1463.9375, "completions/mean_terminated_length": 1463.9375, "completions/min_length": 1377.0, "completions/min_terminated_length": 1377.0, "entropy": 0.03670479077845812, "epoch": 2.2520000900800035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.03146183118224144, "kl": 0.02293368522077799, "learning_rate": 7.462979998465753e-06, "loss": 0.0002, "num_tokens": 12109307.0, "reward": 3.17594051361084, "reward_std": 10.607872009277344, "rewards/rollout_reward_func/mean": 3.17594051361084, "rewards/rollout_reward_func/std": 10.60787296295166, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.9375, "sampling/sampling_logp_difference/mean": 0.2939675748348236, "step": 563, "step_time": 23.383430048983428 }, { "clip_ratio/high_max": 0.004016406659502536, "clip_ratio/high_mean": 0.004016406659502536, "clip_ratio/low_mean": 0.0047785789356566966, "clip_ratio/low_min": 0.0047785789356566966, "clip_ratio/region_mean": 0.008794985653366894, "completions/clipped_ratio": 0.0, "completions/max_length": 1525.0, "completions/max_terminated_length": 1525.0, "completions/mean_length": 1452.9375, "completions/mean_terminated_length": 1452.9375, "completions/min_length": 1377.0, "completions/min_terminated_length": 1377.0, "entropy": 0.04089589836075902, "epoch": 2.2560000902400036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003308656392619014, "kl": 0.017039593192748725, "learning_rate": 7.4629799984599245e-06, "loss": 0.0001, "num_tokens": 12145454.0, "reward": 2.223680019378662, "reward_std": 15.006898880004883, "rewards/rollout_reward_func/mean": 2.223680019378662, "rewards/rollout_reward_func/std": 15.0068998336792, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.3125, "sampling/sampling_logp_difference/mean": 0.29786011576652527, "step": 564, "step_time": 23.036295040998084 }, { "clip_ratio/high_max": 0.005515131109859794, "clip_ratio/high_mean": 0.005515131109859794, "clip_ratio/low_mean": 0.004069049522513524, "clip_ratio/low_min": 0.004069049522513524, "clip_ratio/region_mean": 0.009584180661477149, "completions/clipped_ratio": 0.0, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 1423.9375, "completions/mean_terminated_length": 1423.9375, "completions/min_length": 1372.0, "completions/min_terminated_length": 1372.0, "entropy": 0.04114045528694987, "epoch": 2.2600000904000037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003116633975878358, "kl": 0.018033225554972887, "learning_rate": 7.462979998454086e-06, "loss": 0.0001, "num_tokens": 12181134.0, "reward": 4.032574653625488, "reward_std": 11.381087303161621, "rewards/rollout_reward_func/mean": 4.032574653625488, "rewards/rollout_reward_func/std": 11.381087303161621, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.625, "sampling/sampling_logp_difference/mean": 0.30785229802131653, "step": 565, "step_time": 22.916189233001205 }, { "clip_ratio/high_max": 0.006804340024245903, "clip_ratio/high_mean": 0.006804340024245903, "clip_ratio/low_mean": 0.004071102244779468, "clip_ratio/low_min": 0.004071102244779468, "clip_ratio/region_mean": 0.010875442123506218, "completions/clipped_ratio": 0.0, "completions/max_length": 1496.0, "completions/max_terminated_length": 1496.0, "completions/mean_length": 1455.8125, "completions/mean_terminated_length": 1455.8125, "completions/min_length": 1403.0, "completions/min_terminated_length": 1403.0, "entropy": 0.039355546701699495, "epoch": 2.2640000905600038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004235721193253994, "kl": 0.01700774603523314, "learning_rate": 7.462979998448236e-06, "loss": 0.0001, "num_tokens": 12217347.0, "reward": -1.9233404397964478, "reward_std": 8.74042797088623, "rewards/rollout_reward_func/mean": -1.9233404397964478, "rewards/rollout_reward_func/std": 8.74042797088623, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.625, "sampling/sampling_logp_difference/mean": 0.2899053394794464, "step": 566, "step_time": 22.85405848400842 }, { "clip_ratio/high_max": 0.004126752144657075, "clip_ratio/high_mean": 0.004126752144657075, "clip_ratio/low_mean": 0.0046719823440071195, "clip_ratio/low_min": 0.0046719823440071195, "clip_ratio/region_mean": 0.008798734459560364, "completions/clipped_ratio": 0.0, "completions/max_length": 1477.0, "completions/max_terminated_length": 1477.0, "completions/mean_length": 1344.5, "completions/mean_terminated_length": 1344.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.04145451029762626, "epoch": 2.2680000907200035e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003058356698602438, "kl": 0.015545979258604348, "learning_rate": 7.462979998442375e-06, "loss": 0.0001, "num_tokens": 12251732.0, "reward": 3.32548451423645, "reward_std": 25.234874725341797, "rewards/rollout_reward_func/mean": 3.32548451423645, "rewards/rollout_reward_func/std": 25.234880447387695, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.07870101928711, "sampling/sampling_logp_difference/mean": 0.3075597584247589, "step": 567, "step_time": 22.543086227007734 }, { "clip_ratio/high_max": 0.00390168497688137, "clip_ratio/high_mean": 0.00390168497688137, "clip_ratio/low_mean": 0.005666384153300896, "clip_ratio/low_min": 0.005666384153300896, "clip_ratio/region_mean": 0.009568069130182266, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 1267.0, "completions/mean_terminated_length": 1267.0, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.04403846571221948, "epoch": 2.2720000908800036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028943244833499193, "kl": 0.016174888471141458, "learning_rate": 7.462979998436502e-06, "loss": 0.0001, "num_tokens": 12284892.0, "reward": 21.13507652282715, "reward_std": 38.409507751464844, "rewards/rollout_reward_func/mean": 21.13507652282715, "rewards/rollout_reward_func/std": 38.409507751464844, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.2265625, "sampling/sampling_logp_difference/mean": 0.3089614808559418, "step": 568, "step_time": 22.405939885989937 }, { "clip_ratio/high_max": 0.0032972040644381195, "clip_ratio/high_mean": 0.0032972040644381195, "clip_ratio/low_mean": 0.004605335445376113, "clip_ratio/low_min": 0.004605335445376113, "clip_ratio/region_mean": 0.007902539451606572, "completions/clipped_ratio": 0.0, "completions/max_length": 1523.0, "completions/max_terminated_length": 1523.0, "completions/mean_length": 1423.9375, "completions/mean_terminated_length": 1423.9375, "completions/min_length": 1366.0, "completions/min_terminated_length": 1366.0, "entropy": 0.03929546568542719, "epoch": 2.2760000910400037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00344831682741642, "kl": 0.015290501178242266, "learning_rate": 7.462979998430619e-06, "loss": 0.0001, "num_tokens": 12320570.0, "reward": -1.266954779624939, "reward_std": 8.01811695098877, "rewards/rollout_reward_func/mean": -1.266954779624939, "rewards/rollout_reward_func/std": 8.01811695098877, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.5, "sampling/sampling_logp_difference/mean": 0.2960357964038849, "step": 569, "step_time": 23.16505275599775 }, { "clip_ratio/high_max": 0.0032639759592711926, "clip_ratio/high_mean": 0.0032639759592711926, "clip_ratio/low_mean": 0.005739696236560121, "clip_ratio/low_min": 0.005739696236560121, "clip_ratio/region_mean": 0.009003672283142805, "completions/clipped_ratio": 0.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 1471.1875, "completions/mean_terminated_length": 1471.1875, "completions/min_length": 1415.0, "completions/min_terminated_length": 1415.0, "entropy": 0.03938249545171857, "epoch": 2.2800000912000037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003614873392507434, "kl": 0.016311903833411634, "learning_rate": 7.462979998424724e-06, "loss": 0.0001, "num_tokens": 12357050.0, "reward": -0.438518226146698, "reward_std": 11.767770767211914, "rewards/rollout_reward_func/mean": -0.438518226146698, "rewards/rollout_reward_func/std": 11.767770767211914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.484375, "sampling/sampling_logp_difference/mean": 0.3024628460407257, "step": 570, "step_time": 23.588013420994685 }, { "clip_ratio/high_max": 0.005002300022169948, "clip_ratio/high_mean": 0.005002300022169948, "clip_ratio/low_mean": 0.004243656556354836, "clip_ratio/low_min": 0.004243656556354836, "clip_ratio/region_mean": 0.009245956549420953, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 1440.75, "completions/mean_terminated_length": 1440.75, "completions/min_length": 1346.0, "completions/min_terminated_length": 1346.0, "entropy": 0.039263790007680655, "epoch": 2.2840000913600038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.015598884783685207, "kl": 0.01801356708165258, "learning_rate": 7.46297999841882e-06, "loss": 0.0001, "num_tokens": 12393010.0, "reward": 0.9705657362937927, "reward_std": 12.468517303466797, "rewards/rollout_reward_func/mean": 0.9705657362937927, "rewards/rollout_reward_func/std": 12.46851634979248, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.458984375, "sampling/sampling_logp_difference/mean": 0.29760873317718506, "step": 571, "step_time": 23.736260848010716 }, { "clip_ratio/high_max": 0.004451516491826624, "clip_ratio/high_mean": 0.004451516491826624, "clip_ratio/low_mean": 0.004956183198373765, "clip_ratio/low_min": 0.004956183198373765, "clip_ratio/region_mean": 0.009407699457369745, "completions/clipped_ratio": 0.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 1373.1875, "completions/mean_terminated_length": 1373.1875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.04025691468268633, "epoch": 2.2880000915200036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.026254059746861458, "kl": 0.03218198171816766, "learning_rate": 7.462979998412902e-06, "loss": 0.0002, "num_tokens": 12427889.0, "reward": 9.224740028381348, "reward_std": 26.108966827392578, "rewards/rollout_reward_func/mean": 9.224740028381348, "rewards/rollout_reward_func/std": 26.10896873474121, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.34419631958008, "sampling/sampling_logp_difference/mean": 0.299796462059021, "step": 572, "step_time": 23.105644709001353 }, { "clip_ratio/high_max": 0.005252113507594913, "clip_ratio/high_mean": 0.005252113507594913, "clip_ratio/low_mean": 0.00381499485229142, "clip_ratio/low_min": 0.00381499485229142, "clip_ratio/region_mean": 0.009067108388990164, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 1443.1875, "completions/mean_terminated_length": 1443.1875, "completions/min_length": 1351.0, "completions/min_terminated_length": 1351.0, "entropy": 0.040498620364815, "epoch": 2.2920000916800036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004163495730608702, "kl": 0.016721164109185338, "learning_rate": 7.462979998406975e-06, "loss": 0.0001, "num_tokens": 12463878.0, "reward": -0.8717248439788818, "reward_std": 16.330171585083008, "rewards/rollout_reward_func/mean": -0.8717248439788818, "rewards/rollout_reward_func/std": 16.330171585083008, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.718753814697266, "sampling/sampling_logp_difference/mean": 0.3102433681488037, "step": 573, "step_time": 23.589545545000874 }, { "clip_ratio/high_max": 0.0048362494562752545, "clip_ratio/high_mean": 0.0048362494562752545, "clip_ratio/low_mean": 0.0040582526416983455, "clip_ratio/low_min": 0.0040582526416983455, "clip_ratio/region_mean": 0.008894501952454448, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1476.8125, "completions/mean_terminated_length": 1476.8125, "completions/min_length": 1414.0, "completions/min_terminated_length": 1414.0, "entropy": 0.036421021446585655, "epoch": 2.2960000918400037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00449670385569334, "kl": 0.01654039102140814, "learning_rate": 7.462979998401037e-06, "loss": 0.0001, "num_tokens": 12500434.0, "reward": 2.282651662826538, "reward_std": 16.358179092407227, "rewards/rollout_reward_func/mean": 2.282651662826538, "rewards/rollout_reward_func/std": 16.358179092407227, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.6875, "sampling/sampling_logp_difference/mean": 0.30176058411598206, "step": 574, "step_time": 23.59791247999965 }, { "clip_ratio/high_max": 0.006044322333764285, "clip_ratio/high_mean": 0.006044322333764285, "clip_ratio/low_mean": 0.0024813560594338924, "clip_ratio/low_min": 0.0024813560594338924, "clip_ratio/region_mean": 0.008525678305886686, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 1466.5625, "completions/mean_terminated_length": 1466.5625, "completions/min_length": 1393.0, "completions/min_terminated_length": 1393.0, "entropy": 0.037366037257015705, "epoch": 2.3000000920000038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003939955960959196, "kl": 0.016074639046564698, "learning_rate": 7.4629799983950875e-06, "loss": 0.0001, "num_tokens": 12536823.0, "reward": -3.2791318893432617, "reward_std": 10.406939506530762, "rewards/rollout_reward_func/mean": -3.2791318893432617, "rewards/rollout_reward_func/std": 10.406938552856445, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.90625, "sampling/sampling_logp_difference/mean": 0.3050403296947479, "step": 575, "step_time": 23.682830341022054 }, { "clip_ratio/high_max": 0.003906729951268062, "clip_ratio/high_mean": 0.003906729951268062, "clip_ratio/low_mean": 0.00573046863428317, "clip_ratio/low_min": 0.00573046863428317, "clip_ratio/region_mean": 0.009637198527343571, "completions/clipped_ratio": 0.0, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 1376.0, "completions/mean_terminated_length": 1376.0, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.04042747989296913, "epoch": 2.3040000921600036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002834181534126401, "kl": 0.014181062346324325, "learning_rate": 7.462979998389127e-06, "loss": 0.0001, "num_tokens": 12571739.0, "reward": 8.335563659667969, "reward_std": 24.489727020263672, "rewards/rollout_reward_func/mean": 8.335563659667969, "rewards/rollout_reward_func/std": 24.489727020263672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.156253814697266, "sampling/sampling_logp_difference/mean": 0.3003208339214325, "step": 576, "step_time": 23.05738609299442 }, { "clip_ratio/high_max": 0.005365763092413545, "clip_ratio/high_mean": 0.005365763092413545, "clip_ratio/low_mean": 0.004883060464635491, "clip_ratio/low_min": 0.004883060464635491, "clip_ratio/region_mean": 0.010248823557049036, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 1484.6875, "completions/mean_terminated_length": 1484.6875, "completions/min_length": 1407.0, "completions/min_terminated_length": 1407.0, "entropy": 0.03858310682699084, "epoch": 2.3080000923200036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012600992806255817, "kl": 0.015278342762030661, "learning_rate": 7.4629799983831545e-06, "loss": 0.0001, "num_tokens": 12608444.0, "reward": 0.397562712430954, "reward_std": 7.210548400878906, "rewards/rollout_reward_func/mean": 0.397562712430954, "rewards/rollout_reward_func/std": 7.2105488777160645, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.6875, "sampling/sampling_logp_difference/mean": 0.2984341084957123, "step": 577, "step_time": 23.702945530014404 }, { "clip_ratio/high_max": 0.0048180275480262935, "clip_ratio/high_mean": 0.0048180275480262935, "clip_ratio/low_mean": 0.005579610849963501, "clip_ratio/low_min": 0.005579610849963501, "clip_ratio/region_mean": 0.010397638310678303, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 1441.75, "completions/mean_terminated_length": 1441.75, "completions/min_length": 1352.0, "completions/min_terminated_length": 1352.0, "entropy": 0.0404215008020401, "epoch": 2.3120000924800037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0080475565046072, "kl": 0.018003654200583696, "learning_rate": 7.462979998377172e-06, "loss": 0.0001, "num_tokens": 12644392.0, "reward": -0.7258894443511963, "reward_std": 11.865927696228027, "rewards/rollout_reward_func/mean": -0.7258894443511963, "rewards/rollout_reward_func/std": 11.865927696228027, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.062503814697266, "sampling/sampling_logp_difference/mean": 0.2994309961795807, "step": 578, "step_time": 23.91983864899521 }, { "clip_ratio/high_max": 0.002920883009210229, "clip_ratio/high_mean": 0.002920883009210229, "clip_ratio/low_mean": 0.00551237360923551, "clip_ratio/low_min": 0.00551237360923551, "clip_ratio/region_mean": 0.00843325670575723, "completions/clipped_ratio": 0.0, "completions/max_length": 1463.0, "completions/max_terminated_length": 1463.0, "completions/mean_length": 1416.0, "completions/mean_terminated_length": 1416.0, "completions/min_length": 1366.0, "completions/min_terminated_length": 1366.0, "entropy": 0.0411951313726604, "epoch": 2.3160000926400038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005292879417538643, "kl": 0.017321814550086856, "learning_rate": 7.4629799983711775e-06, "loss": 0.0001, "num_tokens": 12679919.0, "reward": 0.4289880394935608, "reward_std": 10.989635467529297, "rewards/rollout_reward_func/mean": 0.4289880394935608, "rewards/rollout_reward_func/std": 10.989635467529297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.8125, "sampling/sampling_logp_difference/mean": 0.3100878596305847, "step": 579, "step_time": 23.061620235996088 }, { "clip_ratio/high_max": 0.004747363971546292, "clip_ratio/high_mean": 0.004747363971546292, "clip_ratio/low_mean": 0.00528340000892058, "clip_ratio/low_min": 0.00528340000892058, "clip_ratio/region_mean": 0.010030763922259212, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 1359.75, "completions/mean_terminated_length": 1359.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.0413097171112895, "epoch": 2.320000092800004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030898270197212696, "kl": 0.01561389269772917, "learning_rate": 7.462979998365173e-06, "loss": 0.0001, "num_tokens": 12714555.0, "reward": 3.402876615524292, "reward_std": 25.305898666381836, "rewards/rollout_reward_func/mean": 3.402876615524292, "rewards/rollout_reward_func/std": 25.305896759033203, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.28125, "sampling/sampling_logp_difference/mean": 0.3136935830116272, "step": 580, "step_time": 23.07783426399692 }, { "clip_ratio/high_max": 0.005331859953003004, "clip_ratio/high_mean": 0.005331859953003004, "clip_ratio/low_mean": 0.005061762349214405, "clip_ratio/low_min": 0.005061762349214405, "clip_ratio/region_mean": 0.010393622273113579, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 1387.3125, "completions/mean_terminated_length": 1387.3125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.039988490752875805, "epoch": 2.3240000929600036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0047124531120061874, "kl": 0.01794806425459683, "learning_rate": 7.462979998359156e-06, "loss": 0.0001, "num_tokens": 12749652.0, "reward": 4.15208625793457, "reward_std": 24.43208885192871, "rewards/rollout_reward_func/mean": 4.15208625793457, "rewards/rollout_reward_func/std": 24.432090759277344, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.03125, "sampling/sampling_logp_difference/mean": 0.3070465326309204, "step": 581, "step_time": 22.927227037987905 }, { "clip_ratio/high_max": 0.004467463179025799, "clip_ratio/high_mean": 0.004467463179025799, "clip_ratio/low_mean": 0.004891304008197039, "clip_ratio/low_min": 0.004891304008197039, "clip_ratio/region_mean": 0.009358767187222838, "completions/clipped_ratio": 0.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 1462.875, "completions/mean_terminated_length": 1462.875, "completions/min_length": 1365.0, "completions/min_terminated_length": 1365.0, "entropy": 0.04134672041982412, "epoch": 2.3280000931200037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.021212570369243622, "kl": 0.020237171556800604, "learning_rate": 7.462979998353131e-06, "loss": 0.0001, "num_tokens": 12785974.0, "reward": -2.7302629947662354, "reward_std": 16.106035232543945, "rewards/rollout_reward_func/mean": -2.7302629947662354, "rewards/rollout_reward_func/std": 16.106035232543945, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.84375, "sampling/sampling_logp_difference/mean": 0.3004067540168762, "step": 582, "step_time": 23.612995508003223 }, { "clip_ratio/high_max": 0.004740564327221364, "clip_ratio/high_mean": 0.004740564327221364, "clip_ratio/low_mean": 0.0048537804977968335, "clip_ratio/low_min": 0.0048537804977968335, "clip_ratio/region_mean": 0.009594344883225858, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 1479.1875, "completions/mean_terminated_length": 1479.1875, "completions/min_length": 1383.0, "completions/min_terminated_length": 1383.0, "entropy": 0.03602872882038355, "epoch": 2.3320000932800038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006511662155389786, "kl": 0.014530060230754316, "learning_rate": 7.462979998347092e-06, "loss": 0.0001, "num_tokens": 12822579.0, "reward": -1.7809748649597168, "reward_std": 12.37973403930664, "rewards/rollout_reward_func/mean": -1.7809748649597168, "rewards/rollout_reward_func/std": 12.379734992980957, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.875, "sampling/sampling_logp_difference/mean": 0.3014419972896576, "step": 583, "step_time": 23.880045936995884 }, { "clip_ratio/high_max": 0.004561936482787132, "clip_ratio/high_mean": 0.004561936482787132, "clip_ratio/low_mean": 0.005632458458421752, "clip_ratio/low_min": 0.005632458458421752, "clip_ratio/region_mean": 0.010194394853897393, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 1344.5625, "completions/mean_terminated_length": 1344.5625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.043588779866695404, "epoch": 2.336000093440004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009022640064358711, "kl": 0.01748694630805403, "learning_rate": 7.4629799983410434e-06, "loss": 0.0001, "num_tokens": 12856973.0, "reward": 6.360016345977783, "reward_std": 24.379497528076172, "rewards/rollout_reward_func/mean": 6.360016345977783, "rewards/rollout_reward_func/std": 24.379497528076172, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.7578125, "sampling/sampling_logp_difference/mean": 0.31287676095962524, "step": 584, "step_time": 22.691895457013743 }, { "clip_ratio/high_max": 0.003273911075666547, "clip_ratio/high_mean": 0.003273911075666547, "clip_ratio/low_mean": 0.006786086596548557, "clip_ratio/low_min": 0.006786086596548557, "clip_ratio/region_mean": 0.010059997672215104, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 1415.75, "completions/mean_terminated_length": 1415.75, "completions/min_length": 1367.0, "completions/min_terminated_length": 1367.0, "entropy": 0.041735386941581964, "epoch": 2.3400000936000036e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013131143525242805, "kl": 0.018048734753392637, "learning_rate": 7.462979998334983e-06, "loss": 0.0001, "num_tokens": 12892503.0, "reward": -1.4601035118103027, "reward_std": 9.972737312316895, "rewards/rollout_reward_func/mean": -1.4601035118103027, "rewards/rollout_reward_func/std": 9.972737312316895, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.72267150878906, "sampling/sampling_logp_difference/mean": 0.31366145610809326, "step": 585, "step_time": 23.20914785399509 }, { "clip_ratio/high_max": 0.0027495775429997593, "clip_ratio/high_mean": 0.0027495775429997593, "clip_ratio/low_mean": 0.006552651844685897, "clip_ratio/low_min": 0.006552651844685897, "clip_ratio/region_mean": 0.009302229387685657, "completions/clipped_ratio": 0.0, "completions/max_length": 1552.0, "completions/max_terminated_length": 1552.0, "completions/mean_length": 1444.6875, "completions/mean_terminated_length": 1444.6875, "completions/min_length": 1350.0, "completions/min_terminated_length": 1350.0, "entropy": 0.03933595307171345, "epoch": 2.3440000937600037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 58.3470458984375, "kl": 1.9901334708556533, "learning_rate": 7.462979998328911e-06, "loss": 0.0143, "num_tokens": 12928513.0, "reward": 0.8705857992172241, "reward_std": 17.054161071777344, "rewards/rollout_reward_func/mean": 0.8705857992172241, "rewards/rollout_reward_func/std": 17.054161071777344, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.328125, "sampling/sampling_logp_difference/mean": 0.3015964925289154, "step": 586, "step_time": 23.639316744003736 }, { "clip_ratio/high_max": 0.0046628653653897345, "clip_ratio/high_mean": 0.0046628653653897345, "clip_ratio/low_mean": 0.004996626987121999, "clip_ratio/low_min": 0.004996626987121999, "clip_ratio/region_mean": 0.009659492294304073, "completions/clipped_ratio": 0.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 1413.9375, "completions/mean_terminated_length": 1413.9375, "completions/min_length": 1361.0, "completions/min_terminated_length": 1361.0, "entropy": 0.04125935910269618, "epoch": 2.3480000939200038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005913644563406706, "kl": 0.01677791483234614, "learning_rate": 7.46297999832283e-06, "loss": 0.0001, "num_tokens": 12964017.0, "reward": 3.7081849575042725, "reward_std": 11.717628479003906, "rewards/rollout_reward_func/mean": 3.7081849575042725, "rewards/rollout_reward_func/std": 11.717628479003906, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.5, "sampling/sampling_logp_difference/mean": 0.3250042498111725, "step": 587, "step_time": 22.915033338002104 }, { "clip_ratio/high_max": 0.0048603969044052064, "clip_ratio/high_mean": 0.0048603969044052064, "clip_ratio/low_mean": 0.004878586012637243, "clip_ratio/low_min": 0.004878586012637243, "clip_ratio/region_mean": 0.00973898294614628, "completions/clipped_ratio": 0.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 1431.0, "completions/mean_terminated_length": 1431.0, "completions/min_length": 1342.0, "completions/min_terminated_length": 1342.0, "entropy": 0.04016062058508396, "epoch": 2.352000094080004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005973602179437876, "kl": 0.017379582510329783, "learning_rate": 7.462979998316736e-06, "loss": 0.0001, "num_tokens": 12999805.0, "reward": 3.2145285606384277, "reward_std": 8.477996826171875, "rewards/rollout_reward_func/mean": 3.2145285606384277, "rewards/rollout_reward_func/std": 8.477996826171875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.4375, "sampling/sampling_logp_difference/mean": 0.3079354465007782, "step": 588, "step_time": 23.31014136000158 }, { "clip_ratio/high_max": 0.0037247418076731265, "clip_ratio/high_mean": 0.0037247418076731265, "clip_ratio/low_mean": 0.005220573832048103, "clip_ratio/low_min": 0.005220573832048103, "clip_ratio/region_mean": 0.00894531566882506, "completions/clipped_ratio": 0.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 1509.9375, "completions/mean_terminated_length": 1509.9375, "completions/min_length": 1473.0, "completions/min_terminated_length": 1473.0, "entropy": 0.03847635118290782, "epoch": 2.356000094240004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00690101832151413, "kl": 0.015629485016688704, "learning_rate": 7.4629799983106316e-06, "loss": 0.0001, "num_tokens": 13036918.0, "reward": 7.643494129180908, "reward_std": 15.474244117736816, "rewards/rollout_reward_func/mean": 7.643494129180908, "rewards/rollout_reward_func/std": 15.474244117736816, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.875, "sampling/sampling_logp_difference/mean": 0.2922111749649048, "step": 589, "step_time": 23.694791442998394 }, { "clip_ratio/high_max": 0.0056908829137682915, "clip_ratio/high_mean": 0.0056908829137682915, "clip_ratio/low_mean": 0.003706224961206317, "clip_ratio/low_min": 0.003706224961206317, "clip_ratio/region_mean": 0.009397107816766948, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 1417.8125, "completions/mean_terminated_length": 1417.8125, "completions/min_length": 1327.0, "completions/min_terminated_length": 1327.0, "entropy": 0.03972746664658189, "epoch": 2.3600000944000037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028920068871229887, "kl": 0.013403187738731503, "learning_rate": 7.462979998304517e-06, "loss": 0.0001, "num_tokens": 13072481.0, "reward": -2.712306499481201, "reward_std": 7.532032489776611, "rewards/rollout_reward_func/mean": -2.712306499481201, "rewards/rollout_reward_func/std": 7.532032489776611, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.875, "sampling/sampling_logp_difference/mean": 0.30309903621673584, "step": 590, "step_time": 22.864581202004047 }, { "clip_ratio/high_max": 0.003738360130228102, "clip_ratio/high_mean": 0.003738360130228102, "clip_ratio/low_mean": 0.005810904694953933, "clip_ratio/low_min": 0.005810904694953933, "clip_ratio/region_mean": 0.009549264737870544, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 1459.5, "completions/mean_terminated_length": 1459.5, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "entropy": 0.03976095514371991, "epoch": 2.3640000945600037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005965039599686861, "kl": 0.014202530379407108, "learning_rate": 7.46297999829839e-06, "loss": 0.0001, "num_tokens": 13108739.0, "reward": 3.044342517852783, "reward_std": 8.820514678955078, "rewards/rollout_reward_func/mean": 3.044342517852783, "rewards/rollout_reward_func/std": 8.820514678955078, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 59.609375, "sampling/sampling_logp_difference/mean": 0.3119036555290222, "step": 591, "step_time": 23.667309522992582 }, { "clip_ratio/high_max": 0.005488661059644073, "clip_ratio/high_mean": 0.005488661059644073, "clip_ratio/low_mean": 0.0042902576678898185, "clip_ratio/low_min": 0.0042902576678898185, "clip_ratio/region_mean": 0.009778918873053044, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 1488.75, "completions/mean_terminated_length": 1488.75, "completions/min_length": 1289.0, "completions/min_terminated_length": 1289.0, "entropy": 0.03686735522933304, "epoch": 2.3680000947200038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004014178644865751, "kl": 0.013367212261073291, "learning_rate": 7.462979998292253e-06, "loss": 0.0001, "num_tokens": 13145504.0, "reward": 2.4551916122436523, "reward_std": 11.509414672851562, "rewards/rollout_reward_func/mean": 2.4551916122436523, "rewards/rollout_reward_func/std": 11.509416580200195, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.40625, "sampling/sampling_logp_difference/mean": 0.31832602620124817, "step": 592, "step_time": 23.587891075003427 }, { "clip_ratio/high_max": 0.004618623468559235, "clip_ratio/high_mean": 0.004618623468559235, "clip_ratio/low_mean": 0.004708003980340436, "clip_ratio/low_min": 0.004708003980340436, "clip_ratio/region_mean": 0.009326627478003502, "completions/clipped_ratio": 0.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 1419.9375, "completions/mean_terminated_length": 1419.9375, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "entropy": 0.03956747008487582, "epoch": 2.372000094880004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0061981710605323315, "kl": 0.01649990351870656, "learning_rate": 7.462979998286104e-06, "loss": 0.0001, "num_tokens": 13181114.0, "reward": -1.7228872776031494, "reward_std": 10.785075187683105, "rewards/rollout_reward_func/mean": -1.7228872776031494, "rewards/rollout_reward_func/std": 10.785075187683105, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.375, "sampling/sampling_logp_difference/mean": 0.3005006015300751, "step": 593, "step_time": 22.887749324014294 }, { "clip_ratio/high_max": 0.00414429436204955, "clip_ratio/high_mean": 0.00414429436204955, "clip_ratio/low_mean": 0.005090987833682448, "clip_ratio/low_min": 0.005090987833682448, "clip_ratio/region_mean": 0.009235282137524337, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 1474.125, "completions/mean_terminated_length": 1474.125, "completions/min_length": 1425.0, "completions/min_terminated_length": 1425.0, "entropy": 0.039656813722103834, "epoch": 2.3760000950400037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004318554885685444, "kl": 0.014461475773714483, "learning_rate": 7.462979998279945e-06, "loss": 0.0001, "num_tokens": 13217629.0, "reward": 0.6745476722717285, "reward_std": 8.530826568603516, "rewards/rollout_reward_func/mean": 0.6745476722717285, "rewards/rollout_reward_func/std": 8.530827522277832, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.3125, "sampling/sampling_logp_difference/mean": 0.3030755817890167, "step": 594, "step_time": 23.697669350993237 }, { "clip_ratio/high_max": 0.003868857224006206, "clip_ratio/high_mean": 0.003868857224006206, "clip_ratio/low_mean": 0.004754352616146207, "clip_ratio/low_min": 0.004754352616146207, "clip_ratio/region_mean": 0.008623209781944752, "completions/clipped_ratio": 0.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 1435.1875, "completions/mean_terminated_length": 1435.1875, "completions/min_length": 1359.0, "completions/min_terminated_length": 1359.0, "entropy": 0.038286334834992886, "epoch": 2.3800000952000037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003746014554053545, "kl": 0.013677611015737057, "learning_rate": 7.462979998273774e-06, "loss": 0.0001, "num_tokens": 13253502.0, "reward": -1.0157591104507446, "reward_std": 9.49451732635498, "rewards/rollout_reward_func/mean": -1.0157591104507446, "rewards/rollout_reward_func/std": 9.49451732635498, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.25, "sampling/sampling_logp_difference/mean": 0.29920077323913574, "step": 595, "step_time": 23.291635087007307 }, { "clip_ratio/high_max": 0.00573862119927071, "clip_ratio/high_mean": 0.00573862119927071, "clip_ratio/low_mean": 0.004308976698666811, "clip_ratio/low_min": 0.004308976698666811, "clip_ratio/region_mean": 0.010047598043456674, "completions/clipped_ratio": 0.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 1443.0625, "completions/mean_terminated_length": 1443.0625, "completions/min_length": 1347.0, "completions/min_terminated_length": 1347.0, "entropy": 0.03861889895051718, "epoch": 2.3840000953600038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00511526083573699, "kl": 0.01587518770247698, "learning_rate": 7.462979998267593e-06, "loss": 0.0001, "num_tokens": 13289492.0, "reward": -0.7131755352020264, "reward_std": 8.674261093139648, "rewards/rollout_reward_func/mean": -0.7131755352020264, "rewards/rollout_reward_func/std": 8.674261093139648, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.8125, "sampling/sampling_logp_difference/mean": 0.3075158894062042, "step": 596, "step_time": 23.641863801 }, { "clip_ratio/high_max": 0.00575239717727527, "clip_ratio/high_mean": 0.00575239717727527, "clip_ratio/low_mean": 0.003958936198614538, "clip_ratio/low_min": 0.003958936198614538, "clip_ratio/region_mean": 0.009711333317682147, "completions/clipped_ratio": 0.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 1461.5, "completions/mean_terminated_length": 1461.5, "completions/min_length": 1413.0, "completions/min_terminated_length": 1413.0, "entropy": 0.03836348047479987, "epoch": 2.388000095520004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003597307950258255, "kl": 0.014995739329606295, "learning_rate": 7.4629799982614e-06, "loss": 0.0001, "num_tokens": 13325788.0, "reward": -3.2889058589935303, "reward_std": 10.004575729370117, "rewards/rollout_reward_func/mean": -3.2889058589935303, "rewards/rollout_reward_func/std": 10.004576683044434, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.09375, "sampling/sampling_logp_difference/mean": 0.3073439598083496, "step": 597, "step_time": 23.663796289001766 }, { "clip_ratio/high_max": 0.0042539036949165165, "clip_ratio/high_mean": 0.0042539036949165165, "clip_ratio/low_mean": 0.005813063005916774, "clip_ratio/low_min": 0.005813063005916774, "clip_ratio/region_mean": 0.010066966759040952, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1409.875, "completions/mean_terminated_length": 1409.875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.03852875903248787, "epoch": 2.392000095680004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004867880139499903, "kl": 0.016544350190088153, "learning_rate": 7.462979998255196e-06, "loss": 0.0001, "num_tokens": 13361274.0, "reward": 4.203113555908203, "reward_std": 24.949649810791016, "rewards/rollout_reward_func/mean": 4.203113555908203, "rewards/rollout_reward_func/std": 24.949649810791016, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.263675689697266, "sampling/sampling_logp_difference/mean": 0.3083210587501526, "step": 598, "step_time": 23.230083817004925 }, { "clip_ratio/high_max": 0.0030810884200036526, "clip_ratio/high_mean": 0.0030810884200036526, "clip_ratio/low_mean": 0.005889988882699981, "clip_ratio/low_min": 0.005889988882699981, "clip_ratio/region_mean": 0.008971077390015125, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 1442.375, "completions/mean_terminated_length": 1442.375, "completions/min_length": 1371.0, "completions/min_terminated_length": 1371.0, "entropy": 0.039931037463247776, "epoch": 2.3960000958400037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.1441921442747116, "kl": 0.0420718090608716, "learning_rate": 7.462979998248982e-06, "loss": 0.0003, "num_tokens": 13397256.0, "reward": 10.358328819274902, "reward_std": 19.085145950317383, "rewards/rollout_reward_func/mean": 10.358328819274902, "rewards/rollout_reward_func/std": 19.085145950317383, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.84375, "sampling/sampling_logp_difference/mean": 0.3201407194137573, "step": 599, "step_time": 23.241608034011733 }, { "clip_ratio/high_max": 0.005420194938778877, "clip_ratio/high_mean": 0.005420194938778877, "clip_ratio/low_mean": 0.00438696620403789, "clip_ratio/low_min": 0.00438696620403789, "clip_ratio/region_mean": 0.009807161171920598, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 1498.75, "completions/mean_terminated_length": 1498.75, "completions/min_length": 1434.0, "completions/min_terminated_length": 1434.0, "entropy": 0.03856422519311309, "epoch": 2.4000000960000038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005353051237761974, "kl": 0.014600334805436432, "learning_rate": 7.462979998242756e-06, "loss": 0.0001, "num_tokens": 13434185.0, "reward": 0.8973138332366943, "reward_std": 10.975991249084473, "rewards/rollout_reward_func/mean": 0.8973138332366943, "rewards/rollout_reward_func/std": 10.975991249084473, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.0, "sampling/sampling_logp_difference/mean": 0.3059206008911133, "step": 600, "step_time": 23.75709896000626 }, { "clip_ratio/high_max": 0.005453487392514944, "clip_ratio/high_mean": 0.005453487392514944, "clip_ratio/low_mean": 0.004606309899827465, "clip_ratio/low_min": 0.004606309899827465, "clip_ratio/region_mean": 0.01005979732144624, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 1476.8125, "completions/mean_terminated_length": 1476.8125, "completions/min_length": 1363.0, "completions/min_terminated_length": 1363.0, "entropy": 0.03879501298069954, "epoch": 2.404000096160004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032868608832359314, "kl": 0.01572067488450557, "learning_rate": 7.462979998236519e-06, "loss": 0.0001, "num_tokens": 13470744.0, "reward": -4.482424736022949, "reward_std": 13.82524585723877, "rewards/rollout_reward_func/mean": -4.482424736022949, "rewards/rollout_reward_func/std": 13.825244903564453, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.40625, "sampling/sampling_logp_difference/mean": 0.2939872741699219, "step": 601, "step_time": 23.68818607400317 }, { "clip_ratio/high_max": 0.0055496785207651556, "clip_ratio/high_mean": 0.0055496785207651556, "clip_ratio/low_mean": 0.004048458707984537, "clip_ratio/low_min": 0.004048458707984537, "clip_ratio/region_mean": 0.009598137228749692, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 1471.0, "completions/mean_terminated_length": 1471.0, "completions/min_length": 1380.0, "completions/min_terminated_length": 1380.0, "entropy": 0.03785919491201639, "epoch": 2.408000096320004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003585171652957797, "kl": 0.01584290445316583, "learning_rate": 7.46297999823027e-06, "loss": 0.0001, "num_tokens": 13507210.0, "reward": -0.6719400882720947, "reward_std": 12.167032241821289, "rewards/rollout_reward_func/mean": -0.6719400882720947, "rewards/rollout_reward_func/std": 12.167032241821289, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.796875, "sampling/sampling_logp_difference/mean": 0.29947128891944885, "step": 602, "step_time": 23.5923994239929 }, { "clip_ratio/high_max": 0.004009076248621568, "clip_ratio/high_mean": 0.004009076248621568, "clip_ratio/low_mean": 0.005768772185547277, "clip_ratio/low_min": 0.005768772185547277, "clip_ratio/region_mean": 0.009777848434168845, "completions/clipped_ratio": 0.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 1444.0, "completions/mean_terminated_length": 1444.0, "completions/min_length": 1236.0, "completions/min_terminated_length": 1236.0, "entropy": 0.038023282308131456, "epoch": 2.4120000964800037e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028544175438582897, "kl": 0.017391433590091765, "learning_rate": 7.4629799982240114e-06, "loss": 0.0001, "num_tokens": 13543214.0, "reward": 4.8498992919921875, "reward_std": 14.10338306427002, "rewards/rollout_reward_func/mean": 4.8498992919921875, "rewards/rollout_reward_func/std": 14.103382110595703, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.84375, "sampling/sampling_logp_difference/mean": 0.30536383390426636, "step": 603, "step_time": 23.699993847010774 }, { "clip_ratio/high_max": 0.003998039799625985, "clip_ratio/high_mean": 0.003998039799625985, "clip_ratio/low_mean": 0.0044249270868021995, "clip_ratio/low_min": 0.0044249270868021995, "clip_ratio/region_mean": 0.008422966988291591, "completions/clipped_ratio": 0.0, "completions/max_length": 1548.0, "completions/max_terminated_length": 1548.0, "completions/mean_length": 1440.6875, "completions/mean_terminated_length": 1440.6875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.03878716006875038, "epoch": 2.4160000966400038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007868223823606968, "kl": 0.016277158982120454, "learning_rate": 7.462979998217742e-06, "loss": 0.0001, "num_tokens": 13579237.0, "reward": 9.243125915527344, "reward_std": 27.213899612426758, "rewards/rollout_reward_func/mean": 9.243125915527344, "rewards/rollout_reward_func/std": 27.213899612426758, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.71875, "sampling/sampling_logp_difference/mean": 0.2968836724758148, "step": 604, "step_time": 23.470122153994453 }, { "clip_ratio/high_max": 0.0028178628999739885, "clip_ratio/high_mean": 0.0028178628999739885, "clip_ratio/low_mean": 0.006664119253400713, "clip_ratio/low_min": 0.006664119253400713, "clip_ratio/region_mean": 0.009481981978751719, "completions/clipped_ratio": 0.0, "completions/max_length": 1531.0, "completions/max_terminated_length": 1531.0, "completions/mean_length": 1386.5625, "completions/mean_terminated_length": 1386.5625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.03881756588816643, "epoch": 2.420000096800004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038775671273469925, "kl": 0.020610704785212874, "learning_rate": 7.462979998211461e-06, "loss": 0.0001, "num_tokens": 13614336.0, "reward": 6.8249077796936035, "reward_std": 26.408491134643555, "rewards/rollout_reward_func/mean": 6.8249077796936035, "rewards/rollout_reward_func/std": 26.40849494934082, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.140625, "sampling/sampling_logp_difference/mean": 0.296610027551651, "step": 605, "step_time": 23.071633282997936 }, { "clip_ratio/high_max": 0.006233465624973178, "clip_ratio/high_mean": 0.006233465624973178, "clip_ratio/low_mean": 0.0033954515820369124, "clip_ratio/low_min": 0.0033954515820369124, "clip_ratio/region_mean": 0.009628917323425412, "completions/clipped_ratio": 0.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 1453.4375, "completions/mean_terminated_length": 1453.4375, "completions/min_length": 1371.0, "completions/min_terminated_length": 1371.0, "entropy": 0.03946748282760382, "epoch": 2.424000096960004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005397938657552004, "kl": 0.020950178732164204, "learning_rate": 7.462979998205168e-06, "loss": 0.0001, "num_tokens": 13650504.0, "reward": -3.239546298980713, "reward_std": 13.186673164367676, "rewards/rollout_reward_func/mean": -3.239546298980713, "rewards/rollout_reward_func/std": 13.18667221069336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.1875, "sampling/sampling_logp_difference/mean": 0.29635676741600037, "step": 606, "step_time": 23.387494521004555 }, { "clip_ratio/high_max": 0.005361852759961039, "clip_ratio/high_mean": 0.005361852759961039, "clip_ratio/low_mean": 0.0036810009623877704, "clip_ratio/low_min": 0.0036810009623877704, "clip_ratio/region_mean": 0.00904285378055647, "completions/clipped_ratio": 0.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 1459.75, "completions/mean_terminated_length": 1459.75, "completions/min_length": 1334.0, "completions/min_terminated_length": 1334.0, "entropy": 0.03921219427138567, "epoch": 2.428000097120004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02097811922430992, "kl": 0.023674527765251696, "learning_rate": 7.4629799981988656e-06, "loss": 0.0002, "num_tokens": 13686778.0, "reward": -3.1671953201293945, "reward_std": 11.199559211730957, "rewards/rollout_reward_func/mean": -3.1671953201293945, "rewards/rollout_reward_func/std": 11.199560165405273, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.34375, "sampling/sampling_logp_difference/mean": 0.2970430552959442, "step": 607, "step_time": 23.377579579006124 }, { "clip_ratio/high_max": 0.0037056659057270736, "clip_ratio/high_mean": 0.0037056659057270736, "clip_ratio/low_mean": 0.004538075590971857, "clip_ratio/low_min": 0.004538075590971857, "clip_ratio/region_mean": 0.008243741584010422, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1397.4375, "completions/mean_terminated_length": 1397.4375, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "entropy": 0.04135293187573552, "epoch": 2.4320000972800038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004567116964608431, "kl": 0.024052680586464703, "learning_rate": 7.462979998192551e-06, "loss": 0.0002, "num_tokens": 13722022.0, "reward": 7.896929740905762, "reward_std": 30.649133682250977, "rewards/rollout_reward_func/mean": 7.896929740905762, "rewards/rollout_reward_func/std": 30.64913558959961, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.390625, "sampling/sampling_logp_difference/mean": 0.31001466512680054, "step": 608, "step_time": 23.588214365998283 }, { "clip_ratio/high_max": 0.0036002055858261883, "clip_ratio/high_mean": 0.0036002055858261883, "clip_ratio/low_mean": 0.005590512359049171, "clip_ratio/low_min": 0.005590512359049171, "clip_ratio/region_mean": 0.009190717828460038, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 1442.375, "completions/mean_terminated_length": 1442.375, "completions/min_length": 1338.0, "completions/min_terminated_length": 1338.0, "entropy": 0.04004529817029834, "epoch": 2.436000097440004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024291686713695526, "kl": 0.0164652313105762, "learning_rate": 7.462979998186226e-06, "loss": 0.0001, "num_tokens": 13757995.0, "reward": 2.5902585983276367, "reward_std": 10.034664154052734, "rewards/rollout_reward_func/mean": 2.5902585983276367, "rewards/rollout_reward_func/std": 10.034664154052734, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.8125, "sampling/sampling_logp_difference/mean": 0.29937976598739624, "step": 609, "step_time": 23.71380222899461 }, { "clip_ratio/high_max": 0.005044898425694555, "clip_ratio/high_mean": 0.005044898425694555, "clip_ratio/low_mean": 0.004048845905344933, "clip_ratio/low_min": 0.004048845905344933, "clip_ratio/region_mean": 0.009093744331039488, "completions/clipped_ratio": 0.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 1427.125, "completions/mean_terminated_length": 1427.125, "completions/min_length": 1345.0, "completions/min_terminated_length": 1345.0, "entropy": 0.04123760247603059, "epoch": 2.440000097600004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003163304179906845, "kl": 0.016641977825202048, "learning_rate": 7.4629799981798895e-06, "loss": 0.0001, "num_tokens": 13793729.0, "reward": -1.568770408630371, "reward_std": 17.653400421142578, "rewards/rollout_reward_func/mean": -1.568770408630371, "rewards/rollout_reward_func/std": 17.65340232849121, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.5625, "sampling/sampling_logp_difference/mean": 0.2991776764392853, "step": 610, "step_time": 22.795218264996947 }, { "clip_ratio/high_max": 0.0046766853192821145, "clip_ratio/high_mean": 0.0046766853192821145, "clip_ratio/low_mean": 0.003869916865369305, "clip_ratio/low_min": 0.003869916865369305, "clip_ratio/region_mean": 0.008546602097339928, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1459.0625, "completions/mean_terminated_length": 1459.0625, "completions/min_length": 1369.0, "completions/min_terminated_length": 1369.0, "entropy": 0.03995579853653908, "epoch": 2.444000097760004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008720638230443, "kl": 0.021532187703996897, "learning_rate": 7.462979998173542e-06, "loss": 0.0002, "num_tokens": 13829985.0, "reward": 5.819354057312012, "reward_std": 11.420256614685059, "rewards/rollout_reward_func/mean": 5.819354057312012, "rewards/rollout_reward_func/std": 11.420257568359375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.90625, "sampling/sampling_logp_difference/mean": 0.2979537546634674, "step": 611, "step_time": 23.765924700011965 }, { "clip_ratio/high_max": 0.005916538735618815, "clip_ratio/high_mean": 0.005916538735618815, "clip_ratio/low_mean": 0.00292776437709108, "clip_ratio/low_min": 0.00292776437709108, "clip_ratio/region_mean": 0.008844303141813725, "completions/clipped_ratio": 0.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 1441.3125, "completions/mean_terminated_length": 1441.3125, "completions/min_length": 1358.0, "completions/min_terminated_length": 1358.0, "entropy": 0.039690813049674034, "epoch": 2.4480000979200038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010388987138867378, "kl": 0.02161763736512512, "learning_rate": 7.462979998167183e-06, "loss": 0.0002, "num_tokens": 13865948.0, "reward": 3.654644250869751, "reward_std": 15.416064262390137, "rewards/rollout_reward_func/mean": 3.654644250869751, "rewards/rollout_reward_func/std": 15.416065216064453, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.9375, "sampling/sampling_logp_difference/mean": 0.2888951301574707, "step": 612, "step_time": 23.233716366994486 }, { "clip_ratio/high_max": 0.005030653963331133, "clip_ratio/high_mean": 0.005030653963331133, "clip_ratio/low_mean": 0.004588433250319213, "clip_ratio/low_min": 0.004588433250319213, "clip_ratio/region_mean": 0.009619087213650346, "completions/clipped_ratio": 0.0, "completions/max_length": 1468.0, "completions/max_terminated_length": 1468.0, "completions/mean_length": 1333.9375, "completions/mean_terminated_length": 1333.9375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.04131761658936739, "epoch": 2.452000098080004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0020726535003632307, "kl": 0.019978290190920234, "learning_rate": 7.462979998160814e-06, "loss": 0.0001, "num_tokens": 13900169.0, "reward": 6.255397796630859, "reward_std": 22.424846649169922, "rewards/rollout_reward_func/mean": 6.255397796630859, "rewards/rollout_reward_func/std": 22.424846649169922, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.4765625, "sampling/sampling_logp_difference/mean": 0.312170147895813, "step": 613, "step_time": 22.977475496998522 }, { "clip_ratio/high_max": 0.00528125959681347, "clip_ratio/high_mean": 0.00528125959681347, "clip_ratio/low_mean": 0.004306683491449803, "clip_ratio/low_min": 0.004306683491449803, "clip_ratio/region_mean": 0.009587943088263273, "completions/clipped_ratio": 0.0, "completions/max_length": 1533.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 1455.75, "completions/mean_terminated_length": 1455.75, "completions/min_length": 1345.0, "completions/min_terminated_length": 1345.0, "entropy": 0.039759257808327675, "epoch": 2.456000098240004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003950444515794516, "kl": 0.01766972418408841, "learning_rate": 7.462979998154434e-06, "loss": 0.0001, "num_tokens": 13936362.0, "reward": 5.920368194580078, "reward_std": 14.681482315063477, "rewards/rollout_reward_func/mean": 5.920368194580078, "rewards/rollout_reward_func/std": 14.681482315063477, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.5625, "sampling/sampling_logp_difference/mean": 0.2951616942882538, "step": 614, "step_time": 23.47362800000701 }, { "clip_ratio/high_max": 0.0037022740580141544, "clip_ratio/high_mean": 0.0037022740580141544, "clip_ratio/low_mean": 0.005705508170649409, "clip_ratio/low_min": 0.005705508170649409, "clip_ratio/region_mean": 0.009407782286871225, "completions/clipped_ratio": 0.0, "completions/max_length": 1484.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 1434.875, "completions/mean_terminated_length": 1434.875, "completions/min_length": 1389.0, "completions/min_terminated_length": 1389.0, "entropy": 0.04025285551324487, "epoch": 2.460000098400004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004532979801297188, "kl": 0.02036779443733394, "learning_rate": 7.462979998148042e-06, "loss": 0.0001, "num_tokens": 13972219.0, "reward": 1.348813533782959, "reward_std": 9.142998695373535, "rewards/rollout_reward_func/mean": 1.348813533782959, "rewards/rollout_reward_func/std": 9.142999649047852, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.234375, "sampling/sampling_logp_difference/mean": 0.3068994879722595, "step": 615, "step_time": 22.94497079499706 }, { "clip_ratio/high_max": 0.004420596291311085, "clip_ratio/high_mean": 0.004420596291311085, "clip_ratio/low_mean": 0.0037436033599078655, "clip_ratio/low_min": 0.0037436033599078655, "clip_ratio/region_mean": 0.008164199709426612, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 1446.5, "completions/mean_terminated_length": 1446.5, "completions/min_length": 1380.0, "completions/min_terminated_length": 1380.0, "entropy": 0.04070087056607008, "epoch": 2.464000098560004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033986761700361967, "kl": 0.017584501765668392, "learning_rate": 7.462979998141639e-06, "loss": 0.0001, "num_tokens": 14008269.0, "reward": -2.785426378250122, "reward_std": 14.24686336517334, "rewards/rollout_reward_func/mean": -2.785426378250122, "rewards/rollout_reward_func/std": 14.246864318847656, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.78125, "sampling/sampling_logp_difference/mean": 0.30103930830955505, "step": 616, "step_time": 23.349111473005905 }, { "clip_ratio/high_max": 0.003953650943003595, "clip_ratio/high_mean": 0.003953650943003595, "clip_ratio/low_mean": 0.005015617934986949, "clip_ratio/low_min": 0.005015617934986949, "clip_ratio/region_mean": 0.008969268936198205, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 1449.9375, "completions/mean_terminated_length": 1449.9375, "completions/min_length": 1364.0, "completions/min_terminated_length": 1364.0, "entropy": 0.0397700103931129, "epoch": 2.4680000987200038e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.019726324826478958, "kl": 0.02526660426519811, "learning_rate": 7.4629799981352254e-06, "loss": 0.0002, "num_tokens": 14044372.0, "reward": 4.246125221252441, "reward_std": 15.763184547424316, "rewards/rollout_reward_func/mean": 4.246125221252441, "rewards/rollout_reward_func/std": 15.763185501098633, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.843753814697266, "sampling/sampling_logp_difference/mean": 0.30441945791244507, "step": 617, "step_time": 23.328549179008405 }, { "clip_ratio/high_max": 0.003787545021623373, "clip_ratio/high_mean": 0.003787545021623373, "clip_ratio/low_mean": 0.004822011600481346, "clip_ratio/low_min": 0.004822011600481346, "clip_ratio/region_mean": 0.008609556593000889, "completions/clipped_ratio": 0.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 1475.125, "completions/mean_terminated_length": 1475.125, "completions/min_length": 1378.0, "completions/min_terminated_length": 1378.0, "entropy": 0.03692349651828408, "epoch": 2.472000098880004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002474582754075527, "kl": 0.01627051131799817, "learning_rate": 7.462979998128801e-06, "loss": 0.0001, "num_tokens": 14080902.0, "reward": 1.1310646533966064, "reward_std": 15.543924331665039, "rewards/rollout_reward_func/mean": 1.1310646533966064, "rewards/rollout_reward_func/std": 15.543924331665039, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.625, "sampling/sampling_logp_difference/mean": 0.2957988381385803, "step": 618, "step_time": 23.691768979013432 }, { "clip_ratio/high_max": 0.00407232268480584, "clip_ratio/high_mean": 0.00407232268480584, "clip_ratio/low_mean": 0.0046218535280786455, "clip_ratio/low_min": 0.0046218535280786455, "clip_ratio/region_mean": 0.008694176271092147, "completions/clipped_ratio": 0.0, "completions/max_length": 1541.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 1345.125, "completions/mean_terminated_length": 1345.125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.0397958573885262, "epoch": 2.476000099040004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025713990908116102, "kl": 0.019760260242037475, "learning_rate": 7.462979998122366e-06, "loss": 0.0001, "num_tokens": 14115297.0, "reward": 6.935085296630859, "reward_std": 29.73638343811035, "rewards/rollout_reward_func/mean": 6.935085296630859, "rewards/rollout_reward_func/std": 29.736385345458984, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.71875, "sampling/sampling_logp_difference/mean": 0.30841508507728577, "step": 619, "step_time": 23.30434032301855 }, { "clip_ratio/high_max": 0.004939553327858448, "clip_ratio/high_mean": 0.004939553327858448, "clip_ratio/low_mean": 0.003696045750984922, "clip_ratio/low_min": 0.003696045750984922, "clip_ratio/region_mean": 0.008635599340777844, "completions/clipped_ratio": 0.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 1428.8125, "completions/mean_terminated_length": 1428.8125, "completions/min_length": 1367.0, "completions/min_terminated_length": 1367.0, "entropy": 0.0406498983502388, "epoch": 2.480000099200004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002107823733240366, "kl": 0.019439433701336384, "learning_rate": 7.462979998115919e-06, "loss": 0.0001, "num_tokens": 14151034.0, "reward": -0.8411007523536682, "reward_std": 8.012909889221191, "rewards/rollout_reward_func/mean": -0.8411007523536682, "rewards/rollout_reward_func/std": 8.012909889221191, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.28125, "sampling/sampling_logp_difference/mean": 0.2946208715438843, "step": 620, "step_time": 23.071646383010375 }, { "clip_ratio/high_max": 0.0036436041846172884, "clip_ratio/high_mean": 0.0036436041846172884, "clip_ratio/low_mean": 0.005384216958191246, "clip_ratio/low_min": 0.005384216958191246, "clip_ratio/region_mean": 0.00902782124467194, "completions/clipped_ratio": 0.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 1308.6875, "completions/mean_terminated_length": 1308.6875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.04004904069006443, "epoch": 2.484000099360004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.025570053607225418, "kl": 0.021920042228884995, "learning_rate": 7.462979998109461e-06, "loss": 0.0001, "num_tokens": 14184888.0, "reward": 13.582442283630371, "reward_std": 31.186359405517578, "rewards/rollout_reward_func/mean": 13.582442283630371, "rewards/rollout_reward_func/std": 31.18636131286621, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.65625, "sampling/sampling_logp_difference/mean": 0.3062202036380768, "step": 621, "step_time": 22.622284086995933 }, { "clip_ratio/high_max": 0.005940805043792352, "clip_ratio/high_mean": 0.005940805043792352, "clip_ratio/low_mean": 0.0026430479483678937, "clip_ratio/low_min": 0.0026430479483678937, "clip_ratio/region_mean": 0.008583853021264076, "completions/clipped_ratio": 0.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 1424.0, "completions/mean_terminated_length": 1424.0, "completions/min_length": 1339.0, "completions/min_terminated_length": 1339.0, "entropy": 0.04059934290125966, "epoch": 2.488000099520004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022626426070928574, "kl": 0.018971996614709496, "learning_rate": 7.462979998102992e-06, "loss": 0.0001, "num_tokens": 14220568.0, "reward": -2.0982656478881836, "reward_std": 12.412862777709961, "rewards/rollout_reward_func/mean": -2.0982656478881836, "rewards/rollout_reward_func/std": 12.412863731384277, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.875, "sampling/sampling_logp_difference/mean": 0.3029346764087677, "step": 622, "step_time": 23.061708426983387 }, { "clip_ratio/high_max": 0.005395130574470386, "clip_ratio/high_mean": 0.005395130574470386, "clip_ratio/low_mean": 0.003916296118404716, "clip_ratio/low_min": 0.003916296118404716, "clip_ratio/region_mean": 0.00931142660556361, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 1437.1875, "completions/mean_terminated_length": 1437.1875, "completions/min_length": 1357.0, "completions/min_terminated_length": 1357.0, "entropy": 0.04140823986381292, "epoch": 2.492000099680004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002768352860584855, "kl": 0.01505141076631844, "learning_rate": 7.462979998096512e-06, "loss": 0.0001, "num_tokens": 14256471.0, "reward": 1.627608299255371, "reward_std": 12.571651458740234, "rewards/rollout_reward_func/mean": 1.627608299255371, "rewards/rollout_reward_func/std": 12.57165241241455, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.5625, "sampling/sampling_logp_difference/mean": 0.29546603560447693, "step": 623, "step_time": 22.958934577996843 }, { "clip_ratio/high_max": 0.0032604036387056112, "clip_ratio/high_mean": 0.0032604036387056112, "clip_ratio/low_mean": 0.0050046793185174465, "clip_ratio/low_min": 0.0050046793185174465, "clip_ratio/region_mean": 0.008265082957223058, "completions/clipped_ratio": 0.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 1451.75, "completions/mean_terminated_length": 1451.75, "completions/min_length": 1343.0, "completions/min_terminated_length": 1343.0, "entropy": 0.039777443977072835, "epoch": 2.496000099840004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003505127504467964, "kl": 0.01949278498068452, "learning_rate": 7.462979998090021e-06, "loss": 0.0001, "num_tokens": 14292609.0, "reward": -3.306396007537842, "reward_std": 8.676206588745117, "rewards/rollout_reward_func/mean": -3.306396007537842, "rewards/rollout_reward_func/std": 8.676206588745117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.937503814697266, "sampling/sampling_logp_difference/mean": 0.28876161575317383, "step": 624, "step_time": 23.63000615900819 }, { "clip_ratio/high_max": 0.0032338396413251758, "clip_ratio/high_mean": 0.0032338396413251758, "clip_ratio/low_mean": 0.005678031389834359, "clip_ratio/low_min": 0.005678031389834359, "clip_ratio/region_mean": 0.008911871118471026, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 1352.0, "completions/mean_terminated_length": 1352.0, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.03942304756492376, "epoch": 2.500000100000004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012139063328504562, "kl": 0.02348474517930299, "learning_rate": 7.462979998083519e-06, "loss": 0.0002, "num_tokens": 14327130.0, "reward": 5.092710494995117, "reward_std": 23.095314025878906, "rewards/rollout_reward_func/mean": 5.092710494995117, "rewards/rollout_reward_func/std": 23.095314025878906, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.187503814697266, "sampling/sampling_logp_difference/mean": 0.300838828086853, "step": 625, "step_time": 23.358139850999578 }, { "clip_ratio/high_max": 0.0039022982819005847, "clip_ratio/high_mean": 0.0039022982819005847, "clip_ratio/low_mean": 0.006455884344177321, "clip_ratio/low_min": 0.006455884344177321, "clip_ratio/region_mean": 0.010358182713389397, "completions/clipped_ratio": 0.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 1383.875, "completions/mean_terminated_length": 1383.875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.03770908643491566, "epoch": 2.504000100160004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.08417172729969025, "kl": 0.056275439099408686, "learning_rate": 7.462979998077006e-06, "loss": 0.0003, "num_tokens": 14362174.0, "reward": 7.087791919708252, "reward_std": 22.166337966918945, "rewards/rollout_reward_func/mean": 7.087791919708252, "rewards/rollout_reward_func/std": 22.166337966918945, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.875, "sampling/sampling_logp_difference/mean": 0.30378881096839905, "step": 626, "step_time": 22.80021358099475 }, { "clip_ratio/high_max": 0.004318095016060397, "clip_ratio/high_mean": 0.004318095016060397, "clip_ratio/low_mean": 0.004211250692605972, "clip_ratio/low_min": 0.004211250692605972, "clip_ratio/region_mean": 0.00852934579597786, "completions/clipped_ratio": 0.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 1467.75, "completions/mean_terminated_length": 1467.75, "completions/min_length": 1351.0, "completions/min_terminated_length": 1351.0, "entropy": 0.03923634719103575, "epoch": 2.508000100320004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0023460208903998137, "kl": 0.016297273454256356, "learning_rate": 7.462979998070482e-06, "loss": 0.0001, "num_tokens": 14398578.0, "reward": 4.784086227416992, "reward_std": 13.151532173156738, "rewards/rollout_reward_func/mean": 4.784086227416992, "rewards/rollout_reward_func/std": 13.151532173156738, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.375, "sampling/sampling_logp_difference/mean": 0.2978185713291168, "step": 627, "step_time": 23.91257645200676 }, { "clip_ratio/high_max": 0.00507410874706693, "clip_ratio/high_mean": 0.00507410874706693, "clip_ratio/low_mean": 0.003987316100392491, "clip_ratio/low_min": 0.003987316100392491, "clip_ratio/region_mean": 0.00906142476014793, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 1415.5, "completions/mean_terminated_length": 1415.5, "completions/min_length": 1322.0, "completions/min_terminated_length": 1322.0, "entropy": 0.03964635077863932, "epoch": 2.512000100480004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037915303837507963, "kl": 0.019179450115188956, "learning_rate": 7.462979998063947e-06, "loss": 0.0001, "num_tokens": 14434108.0, "reward": 4.648604393005371, "reward_std": 16.838850021362305, "rewards/rollout_reward_func/mean": 4.648604393005371, "rewards/rollout_reward_func/std": 16.838850021362305, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.47126007080078, "sampling/sampling_logp_difference/mean": 0.2978397011756897, "step": 628, "step_time": 22.816523005996714 }, { "clip_ratio/high_max": 0.0031705920991953462, "clip_ratio/high_mean": 0.0031705920991953462, "clip_ratio/low_mean": 0.003875362657709047, "clip_ratio/low_min": 0.003875362657709047, "clip_ratio/region_mean": 0.0070459547569043934, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 1429.5625, "completions/mean_terminated_length": 1429.5625, "completions/min_length": 1350.0, "completions/min_terminated_length": 1350.0, "entropy": 0.038175603840500116, "epoch": 2.516000100640004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005297460593283176, "kl": 0.014218574739061296, "learning_rate": 7.4629799980574e-06, "loss": 0.0001, "num_tokens": 14469861.0, "reward": 0.639496922492981, "reward_std": 6.40993595123291, "rewards/rollout_reward_func/mean": 0.639496922492981, "rewards/rollout_reward_func/std": 6.409936428070068, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.84765625, "sampling/sampling_logp_difference/mean": 0.29372602701187134, "step": 629, "step_time": 23.39672909599176 }, { "clip_ratio/high_max": 0.004662851570174098, "clip_ratio/high_mean": 0.004662851570174098, "clip_ratio/low_mean": 0.0040078510355670005, "clip_ratio/low_min": 0.0040078510355670005, "clip_ratio/region_mean": 0.008670702634844929, "completions/clipped_ratio": 0.0, "completions/max_length": 1459.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 1413.1875, "completions/mean_terminated_length": 1413.1875, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "entropy": 0.04053363995626569, "epoch": 2.5200001008000042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.28009214997291565, "kl": 0.05578337248880416, "learning_rate": 7.462979998050843e-06, "loss": 0.0004, "num_tokens": 14505348.0, "reward": -1.3817188739776611, "reward_std": 8.789987564086914, "rewards/rollout_reward_func/mean": -1.3817188739776611, "rewards/rollout_reward_func/std": 8.789987564086914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.625, "sampling/sampling_logp_difference/mean": 0.30179956555366516, "step": 630, "step_time": 27.34855897201487 }, { "clip_ratio/high_max": 0.0036451105843298137, "clip_ratio/high_mean": 0.0036451105843298137, "clip_ratio/low_mean": 0.005617400805931538, "clip_ratio/low_min": 0.005617400805931538, "clip_ratio/region_mean": 0.00926251127384603, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 1445.375, "completions/mean_terminated_length": 1445.375, "completions/min_length": 1348.0, "completions/min_terminated_length": 1348.0, "entropy": 0.03945912839844823, "epoch": 2.524000100960004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003733268240466714, "kl": 0.019791533704847097, "learning_rate": 7.462979998044275e-06, "loss": 0.0001, "num_tokens": 14541381.0, "reward": 2.5819053649902344, "reward_std": 12.297783851623535, "rewards/rollout_reward_func/mean": 2.5819053649902344, "rewards/rollout_reward_func/std": 12.297783851623535, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.852569580078125, "sampling/sampling_logp_difference/mean": 0.30125507712364197, "step": 631, "step_time": 23.791827941997326 }, { "clip_ratio/high_max": 0.0035949612502008677, "clip_ratio/high_mean": 0.0035949612502008677, "clip_ratio/low_mean": 0.004465228441404179, "clip_ratio/low_min": 0.004465228441404179, "clip_ratio/region_mean": 0.008060189778916538, "completions/clipped_ratio": 0.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 1468.9375, "completions/mean_terminated_length": 1468.9375, "completions/min_length": 1358.0, "completions/min_terminated_length": 1358.0, "entropy": 0.038147898856550455, "epoch": 2.528000101120004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004145969171077013, "kl": 0.019359736586920917, "learning_rate": 7.462979998037695e-06, "loss": 0.0001, "num_tokens": 14577813.0, "reward": -0.29856014251708984, "reward_std": 7.201072692871094, "rewards/rollout_reward_func/mean": -0.29856014251708984, "rewards/rollout_reward_func/std": 7.201073169708252, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.625, "sampling/sampling_logp_difference/mean": 0.29074662923812866, "step": 632, "step_time": 23.991089149996697 }, { "clip_ratio/high_max": 0.003900296811480075, "clip_ratio/high_mean": 0.003900296811480075, "clip_ratio/low_mean": 0.006153611349873245, "clip_ratio/low_min": 0.006153611349873245, "clip_ratio/region_mean": 0.01005390821956098, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 1443.5, "completions/mean_terminated_length": 1443.5, "completions/min_length": 1349.0, "completions/min_terminated_length": 1349.0, "entropy": 0.040235049556940794, "epoch": 2.532000101280004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002762790536507964, "kl": 0.021143753780052066, "learning_rate": 7.462979998031104e-06, "loss": 0.0001, "num_tokens": 14613806.0, "reward": -1.0061484575271606, "reward_std": 9.62979793548584, "rewards/rollout_reward_func/mean": -1.0061484575271606, "rewards/rollout_reward_func/std": 9.629798889160156, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.421875, "sampling/sampling_logp_difference/mean": 0.29450225830078125, "step": 633, "step_time": 23.699839087996224 }, { "clip_ratio/high_max": 0.004102358419913799, "clip_ratio/high_mean": 0.004102358419913799, "clip_ratio/low_mean": 0.005123386712512001, "clip_ratio/low_min": 0.005123386712512001, "clip_ratio/region_mean": 0.009225745045114309, "completions/clipped_ratio": 0.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 1483.4375, "completions/mean_terminated_length": 1483.4375, "completions/min_length": 1422.0, "completions/min_terminated_length": 1422.0, "entropy": 0.04095361800864339, "epoch": 2.5360001014400042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028007538057863712, "kl": 0.01995270373299718, "learning_rate": 7.462979998024504e-06, "loss": 0.0001, "num_tokens": 14650464.0, "reward": -0.439617395401001, "reward_std": 14.014013290405273, "rewards/rollout_reward_func/mean": -0.439617395401001, "rewards/rollout_reward_func/std": 14.014013290405273, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.95705795288086, "sampling/sampling_logp_difference/mean": 0.2985129654407501, "step": 634, "step_time": 23.74986653799715 }, { "clip_ratio/high_max": 0.004446211911272258, "clip_ratio/high_mean": 0.004446211911272258, "clip_ratio/low_mean": 0.004460429685423151, "clip_ratio/low_min": 0.004460429685423151, "clip_ratio/region_mean": 0.008906641509383917, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 1353.25, "completions/mean_terminated_length": 1353.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.04077884694561362, "epoch": 2.540000101600004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030203128699213266, "kl": 0.02060991490725428, "learning_rate": 7.46297999801789e-06, "loss": 0.0001, "num_tokens": 14685008.0, "reward": 3.6734442710876465, "reward_std": 26.401262283325195, "rewards/rollout_reward_func/mean": 3.6734442710876465, "rewards/rollout_reward_func/std": 26.401262283325195, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.9375, "sampling/sampling_logp_difference/mean": 0.2874056100845337, "step": 635, "step_time": 23.022439845000918 }, { "clip_ratio/high_max": 0.004790350561961532, "clip_ratio/high_mean": 0.004790350561961532, "clip_ratio/low_mean": 0.004642666928702965, "clip_ratio/low_min": 0.004642666928702965, "clip_ratio/region_mean": 0.009433017519768327, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 1450.9375, "completions/mean_terminated_length": 1450.9375, "completions/min_length": 1345.0, "completions/min_terminated_length": 1345.0, "entropy": 0.04027255019173026, "epoch": 2.544000101760004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033950384240597486, "kl": 0.020956107415258884, "learning_rate": 7.462979998011267e-06, "loss": 0.0001, "num_tokens": 14721132.0, "reward": -2.618067741394043, "reward_std": 4.662227630615234, "rewards/rollout_reward_func/mean": -2.618067741394043, "rewards/rollout_reward_func/std": 4.662228107452393, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.69921875, "sampling/sampling_logp_difference/mean": 0.29990488290786743, "step": 636, "step_time": 23.224547234007332 }, { "clip_ratio/high_max": 0.0047921554069034755, "clip_ratio/high_mean": 0.0047921554069034755, "clip_ratio/low_mean": 0.004446799663128331, "clip_ratio/low_min": 0.004446799663128331, "clip_ratio/region_mean": 0.009238955040927976, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 1439.125, "completions/mean_terminated_length": 1439.125, "completions/min_length": 1374.0, "completions/min_terminated_length": 1374.0, "entropy": 0.039942468982189894, "epoch": 2.548000101920004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004495986271649599, "kl": 0.021481085568666458, "learning_rate": 7.4629799980046325e-06, "loss": 0.0002, "num_tokens": 14757030.0, "reward": -3.314774751663208, "reward_std": 9.919927597045898, "rewards/rollout_reward_func/mean": -3.314774751663208, "rewards/rollout_reward_func/std": 9.919927597045898, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.66033935546875, "sampling/sampling_logp_difference/mean": 0.28719788789749146, "step": 637, "step_time": 23.27949662800529 }, { "clip_ratio/high_max": 0.0039456504746340215, "clip_ratio/high_mean": 0.0039456504746340215, "clip_ratio/low_mean": 0.00429106384399347, "clip_ratio/low_min": 0.00429106384399347, "clip_ratio/region_mean": 0.008236714289523661, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 1453.125, "completions/mean_terminated_length": 1453.125, "completions/min_length": 1319.0, "completions/min_terminated_length": 1319.0, "entropy": 0.04102363623678684, "epoch": 2.552000102080004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029171358328312635, "kl": 0.01861653453670442, "learning_rate": 7.462979997997987e-06, "loss": 0.0001, "num_tokens": 14793198.0, "reward": -1.4383487701416016, "reward_std": 17.89716339111328, "rewards/rollout_reward_func/mean": -1.4383487701416016, "rewards/rollout_reward_func/std": 17.89716339111328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.375, "sampling/sampling_logp_difference/mean": 0.29160887002944946, "step": 638, "step_time": 23.250269227006356 }, { "clip_ratio/high_max": 0.0029320500034373254, "clip_ratio/high_mean": 0.0029320500034373254, "clip_ratio/low_mean": 0.005039577459683642, "clip_ratio/low_min": 0.005039577459683642, "clip_ratio/region_mean": 0.007971627346705645, "completions/clipped_ratio": 0.0, "completions/max_length": 1544.0, "completions/max_terminated_length": 1544.0, "completions/mean_length": 1430.0625, "completions/mean_terminated_length": 1430.0625, "completions/min_length": 1357.0, "completions/min_terminated_length": 1357.0, "entropy": 0.04226940032094717, "epoch": 2.5560001022400042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.057191457599401474, "kl": 0.03513126540929079, "learning_rate": 7.46297999799133e-06, "loss": 0.0002, "num_tokens": 14828967.0, "reward": -1.622079610824585, "reward_std": 11.489084243774414, "rewards/rollout_reward_func/mean": -1.622079610824585, "rewards/rollout_reward_func/std": 11.48908519744873, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.25, "sampling/sampling_logp_difference/mean": 0.296810507774353, "step": 639, "step_time": 23.40487406199827 }, { "clip_ratio/high_max": 0.00393569830339402, "clip_ratio/high_mean": 0.00393569830339402, "clip_ratio/low_mean": 0.00492844294058159, "clip_ratio/low_min": 0.00492844294058159, "clip_ratio/region_mean": 0.00886414124397561, "completions/clipped_ratio": 0.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 1452.6875, "completions/mean_terminated_length": 1452.6875, "completions/min_length": 1355.0, "completions/min_terminated_length": 1355.0, "entropy": 0.040539424400776625, "epoch": 2.560000102400004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003314268309623003, "kl": 0.02280592219904065, "learning_rate": 7.462979997984662e-06, "loss": 0.0002, "num_tokens": 14865123.0, "reward": 1.3476483821868896, "reward_std": 10.964186668395996, "rewards/rollout_reward_func/mean": 1.3476483821868896, "rewards/rollout_reward_func/std": 10.964186668395996, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.625, "sampling/sampling_logp_difference/mean": 0.28450247645378113, "step": 640, "step_time": 23.58662873400317 }, { "clip_ratio/high_max": 0.004254053521435708, "clip_ratio/high_mean": 0.004254053521435708, "clip_ratio/low_mean": 0.005408274359069765, "clip_ratio/low_min": 0.005408274359069765, "clip_ratio/region_mean": 0.009662327880505472, "completions/clipped_ratio": 0.0, "completions/max_length": 1864.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1720.8125, "completions/mean_terminated_length": 1720.8125, "completions/min_length": 1398.0, "completions/min_terminated_length": 1398.0, "entropy": 0.0386901069432497, "epoch": 2.564000102560004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004145416896790266, "kl": 0.0205020175781101, "learning_rate": 7.462979997977983e-06, "loss": 0.0002, "num_tokens": 14905574.0, "reward": -2.500220775604248, "reward_std": 21.421585083007812, "rewards/rollout_reward_func/mean": -2.500220775604248, "rewards/rollout_reward_func/std": 21.421585083007812, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.71875, "sampling/sampling_logp_difference/mean": 0.28467124700546265, "step": 641, "step_time": 26.10893914299959 }, { "clip_ratio/high_max": 0.005285681982059032, "clip_ratio/high_mean": 0.005285681982059032, "clip_ratio/low_mean": 0.003192558535374701, "clip_ratio/low_min": 0.003192558535374701, "clip_ratio/region_mean": 0.008478240575641394, "completions/clipped_ratio": 0.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 1707.625, "completions/mean_terminated_length": 1707.625, "completions/min_length": 1582.0, "completions/min_terminated_length": 1582.0, "entropy": 0.038001451175659895, "epoch": 2.568000102720004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037813340313732624, "kl": 0.019535634899511933, "learning_rate": 7.462979997971293e-06, "loss": 0.0002, "num_tokens": 14945787.0, "reward": -2.4860634803771973, "reward_std": 9.650492668151855, "rewards/rollout_reward_func/mean": -2.4860634803771973, "rewards/rollout_reward_func/std": 9.650493621826172, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.0000114440918, "sampling/sampling_logp_difference/mean": 0.2684651017189026, "step": 642, "step_time": 26.321353523009748 }, { "clip_ratio/high_max": 0.004754377820063382, "clip_ratio/high_mean": 0.004754377820063382, "clip_ratio/low_mean": 0.004487090278416872, "clip_ratio/low_min": 0.004487090278416872, "clip_ratio/region_mean": 0.009241468098480254, "completions/clipped_ratio": 0.0, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 1741.5, "completions/mean_terminated_length": 1741.5, "completions/min_length": 1659.0, "completions/min_terminated_length": 1659.0, "entropy": 0.03744505671784282, "epoch": 2.5720001028800042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005653173662722111, "kl": 0.020709649892523885, "learning_rate": 7.462979997964592e-06, "loss": 0.0002, "num_tokens": 14986557.0, "reward": -5.176024436950684, "reward_std": 5.905522346496582, "rewards/rollout_reward_func/mean": -5.176024436950684, "rewards/rollout_reward_func/std": 5.90552282333374, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.0625, "sampling/sampling_logp_difference/mean": 0.2762643098831177, "step": 643, "step_time": 26.36653246901551 }, { "clip_ratio/high_max": 0.00547255331184715, "clip_ratio/high_mean": 0.00547255331184715, "clip_ratio/low_mean": 0.0035038172791246325, "clip_ratio/low_min": 0.0035038172791246325, "clip_ratio/region_mean": 0.008976370561867952, "completions/clipped_ratio": 0.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 1757.625, "completions/mean_terminated_length": 1757.625, "completions/min_length": 1445.0, "completions/min_terminated_length": 1445.0, "entropy": 0.037539555225521326, "epoch": 2.576000103040004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003494554664939642, "kl": 0.02120010752696544, "learning_rate": 7.4629799979578805e-06, "loss": 0.0002, "num_tokens": 15027593.0, "reward": 0.5774478912353516, "reward_std": 26.36042022705078, "rewards/rollout_reward_func/mean": 0.5774478912353516, "rewards/rollout_reward_func/std": 26.360422134399414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.109375, "sampling/sampling_logp_difference/mean": 0.275745689868927, "step": 644, "step_time": 26.383491371991113 }, { "clip_ratio/high_max": 0.004561115871183574, "clip_ratio/high_mean": 0.004561115871183574, "clip_ratio/low_mean": 0.004081357561517507, "clip_ratio/low_min": 0.004081357561517507, "clip_ratio/region_mean": 0.008642473432701081, "completions/clipped_ratio": 0.0, "completions/max_length": 1869.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 1669.75, "completions/mean_terminated_length": 1669.75, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.036977025447413325, "epoch": 2.580000103200004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033401907421648502, "kl": 0.021770434686914086, "learning_rate": 7.4629799979511576e-06, "loss": 0.0002, "num_tokens": 15067226.0, "reward": -3.0174474716186523, "reward_std": 27.254871368408203, "rewards/rollout_reward_func/mean": -3.0174474716186523, "rewards/rollout_reward_func/std": 27.25486946105957, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.84375, "sampling/sampling_logp_difference/mean": 0.2765793204307556, "step": 645, "step_time": 25.63573585300037 }, { "clip_ratio/high_max": 0.0035847059916704893, "clip_ratio/high_mean": 0.0035847059916704893, "clip_ratio/low_mean": 0.0035654747916851193, "clip_ratio/low_min": 0.0035654747916851193, "clip_ratio/region_mean": 0.007150180812459439, "completions/clipped_ratio": 0.0, "completions/max_length": 1849.0, "completions/max_terminated_length": 1849.0, "completions/mean_length": 1783.125, "completions/mean_terminated_length": 1783.125, "completions/min_length": 1654.0, "completions/min_terminated_length": 1654.0, "entropy": 0.0384394358843565, "epoch": 2.584000103360004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007262226194143295, "kl": 0.02097374526783824, "learning_rate": 7.462979997944424e-06, "loss": 0.0002, "num_tokens": 15108687.0, "reward": -5.809347629547119, "reward_std": 8.55140209197998, "rewards/rollout_reward_func/mean": -5.809347629547119, "rewards/rollout_reward_func/std": 8.551403045654297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.5, "sampling/sampling_logp_difference/mean": 0.27784961462020874, "step": 646, "step_time": 26.29175966398907 }, { "clip_ratio/high_max": 0.004506417317315936, "clip_ratio/high_mean": 0.004506417317315936, "clip_ratio/low_mean": 0.0031894789717625827, "clip_ratio/low_min": 0.0031894789717625827, "clip_ratio/region_mean": 0.007695896318182349, "completions/clipped_ratio": 0.0, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 1736.4375, "completions/mean_terminated_length": 1736.4375, "completions/min_length": 1658.0, "completions/min_terminated_length": 1658.0, "entropy": 0.03694075392559171, "epoch": 2.5880001035200042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030863520223647356, "kl": 0.018194671953096986, "learning_rate": 7.462979997937679e-06, "loss": 0.0002, "num_tokens": 15149376.0, "reward": -3.722909450531006, "reward_std": 7.6642866134643555, "rewards/rollout_reward_func/mean": -3.722909450531006, "rewards/rollout_reward_func/std": 7.664287090301514, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.84375, "sampling/sampling_logp_difference/mean": 0.2701217830181122, "step": 647, "step_time": 26.53884535200632 }, { "clip_ratio/high_max": 0.005457283841678873, "clip_ratio/high_mean": 0.005457283841678873, "clip_ratio/low_mean": 0.003466622409177944, "clip_ratio/low_min": 0.003466622409177944, "clip_ratio/region_mean": 0.008923906250856817, "completions/clipped_ratio": 0.0, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 1754.6875, "completions/mean_terminated_length": 1754.6875, "completions/min_length": 1629.0, "completions/min_terminated_length": 1629.0, "entropy": 0.03503948240540922, "epoch": 2.5920001036800043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010875414125621319, "kl": 0.021185687161050737, "learning_rate": 7.462979997930923e-06, "loss": 0.0002, "num_tokens": 15190353.0, "reward": -9.691915512084961, "reward_std": 5.774526119232178, "rewards/rollout_reward_func/mean": -9.691915512084961, "rewards/rollout_reward_func/std": 5.774526596069336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.5078125, "sampling/sampling_logp_difference/mean": 0.27672621607780457, "step": 648, "step_time": 26.175869640996098 }, { "clip_ratio/high_max": 0.0044353244884405285, "clip_ratio/high_mean": 0.0044353244884405285, "clip_ratio/low_mean": 0.0036241902271285653, "clip_ratio/low_min": 0.0036241902271285653, "clip_ratio/region_mean": 0.008059514744672924, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 1759.6875, "completions/mean_terminated_length": 1759.6875, "completions/min_length": 1639.0, "completions/min_terminated_length": 1639.0, "entropy": 0.03606190858408809, "epoch": 2.596000103840004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.020623158663511276, "kl": 0.02350360550917685, "learning_rate": 7.462979997924155e-06, "loss": 0.0002, "num_tokens": 15231426.0, "reward": -2.773650884628296, "reward_std": 8.389162063598633, "rewards/rollout_reward_func/mean": -2.773650884628296, "rewards/rollout_reward_func/std": 8.389162063598633, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.65625, "sampling/sampling_logp_difference/mean": 0.27896103262901306, "step": 649, "step_time": 26.379392761999043 }, { "clip_ratio/high_max": 0.004288274474674836, "clip_ratio/high_mean": 0.004288274474674836, "clip_ratio/low_mean": 0.0034055671421810985, "clip_ratio/low_min": 0.0034055671421810985, "clip_ratio/region_mean": 0.007693841587752104, "completions/clipped_ratio": 0.0, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 1647.4375, "completions/mean_terminated_length": 1647.4375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.03663186077028513, "epoch": 2.600000104000004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006358327344059944, "kl": 0.01726291689556092, "learning_rate": 7.462979997917376e-06, "loss": 0.0001, "num_tokens": 15270693.0, "reward": -2.3938474655151367, "reward_std": 27.769086837768555, "rewards/rollout_reward_func/mean": -2.3938474655151367, "rewards/rollout_reward_func/std": 27.769086837768555, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.859375, "sampling/sampling_logp_difference/mean": 0.2716081142425537, "step": 650, "step_time": 25.518990510005096 }, { "clip_ratio/high_max": 0.004886298760538921, "clip_ratio/high_mean": 0.004886298760538921, "clip_ratio/low_mean": 0.0034362156002316624, "clip_ratio/low_min": 0.0034362156002316624, "clip_ratio/region_mean": 0.008322514360770583, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 1810.125, "completions/mean_terminated_length": 1810.125, "completions/min_length": 1710.0, "completions/min_terminated_length": 1710.0, "entropy": 0.03458842122927308, "epoch": 2.6040001041600042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005663163494318724, "kl": 0.018641653936356306, "learning_rate": 7.462979997910587e-06, "loss": 0.0002, "num_tokens": 15312615.0, "reward": -8.184697151184082, "reward_std": 9.923417091369629, "rewards/rollout_reward_func/mean": -8.184697151184082, "rewards/rollout_reward_func/std": 9.923417091369629, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.7578125, "sampling/sampling_logp_difference/mean": 0.2673054039478302, "step": 651, "step_time": 25.904713098992943 }, { "clip_ratio/high_max": 0.004694091796409339, "clip_ratio/high_mean": 0.004694091796409339, "clip_ratio/low_mean": 0.003973466198658571, "clip_ratio/low_min": 0.003973466198658571, "clip_ratio/region_mean": 0.00866755802417174, "completions/clipped_ratio": 0.0, "completions/max_length": 1877.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 1687.6875, "completions/mean_terminated_length": 1687.6875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.03505092766135931, "epoch": 2.6080001043200043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036305661778897047, "kl": 0.01603234198410064, "learning_rate": 7.462979997903786e-06, "loss": 0.0001, "num_tokens": 15352553.0, "reward": -1.2814490795135498, "reward_std": 25.863691329956055, "rewards/rollout_reward_func/mean": -1.2814490795135498, "rewards/rollout_reward_func/std": 25.863689422607422, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.28125, "sampling/sampling_logp_difference/mean": 0.2710849344730377, "step": 652, "step_time": 25.831573616997048 }, { "clip_ratio/high_max": 0.004519909591181204, "clip_ratio/high_mean": 0.004519909591181204, "clip_ratio/low_mean": 0.0044782349723391235, "clip_ratio/low_min": 0.0044782349723391235, "clip_ratio/region_mean": 0.008998144534416497, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 1680.4375, "completions/mean_terminated_length": 1680.4375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.03672415483742952, "epoch": 2.612000104480004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003595277201384306, "kl": 0.01953432464506477, "learning_rate": 7.462979997896975e-06, "loss": 0.0002, "num_tokens": 15392356.0, "reward": -5.104970455169678, "reward_std": 23.37249755859375, "rewards/rollout_reward_func/mean": -5.104970455169678, "rewards/rollout_reward_func/std": 23.372499465942383, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.46875, "sampling/sampling_logp_difference/mean": 0.27131298184394836, "step": 653, "step_time": 26.036050596005225 }, { "clip_ratio/high_max": 0.0028297552780713886, "clip_ratio/high_mean": 0.0028297552780713886, "clip_ratio/low_mean": 0.005305197322741151, "clip_ratio/low_min": 0.005305197322741151, "clip_ratio/region_mean": 0.00813495262991637, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1662.625, "completions/mean_terminated_length": 1662.625, "completions/min_length": 1030.0, "completions/min_terminated_length": 1030.0, "entropy": 0.03894874360412359, "epoch": 2.616000104640004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005459875334054232, "kl": 0.017581417225301266, "learning_rate": 7.462979997890153e-06, "loss": 0.0001, "num_tokens": 15431829.0, "reward": 2.074522018432617, "reward_std": 35.78578186035156, "rewards/rollout_reward_func/mean": 2.074522018432617, "rewards/rollout_reward_func/std": 35.78578186035156, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.9140739440918, "sampling/sampling_logp_difference/mean": 0.29021719098091125, "step": 654, "step_time": 25.69083040201076 }, { "clip_ratio/high_max": 0.004404324135975912, "clip_ratio/high_mean": 0.004404324135975912, "clip_ratio/low_mean": 0.00377278178348206, "clip_ratio/low_min": 0.00377278178348206, "clip_ratio/region_mean": 0.008177105919457972, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1788.0625, "completions/mean_terminated_length": 1788.0625, "completions/min_length": 1700.0, "completions/min_terminated_length": 1700.0, "entropy": 0.03623384074307978, "epoch": 2.6200001048000042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004955986514687538, "kl": 0.018264729529619217, "learning_rate": 7.4629799978833195e-06, "loss": 0.0002, "num_tokens": 15473367.0, "reward": -7.780479431152344, "reward_std": 14.15961742401123, "rewards/rollout_reward_func/mean": -7.780479431152344, "rewards/rollout_reward_func/std": 14.15961742401123, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.609375, "sampling/sampling_logp_difference/mean": 0.2724405527114868, "step": 655, "step_time": 26.031569526996464 }, { "clip_ratio/high_max": 0.0030510798023897223, "clip_ratio/high_mean": 0.0030510798023897223, "clip_ratio/low_mean": 0.005017350427806377, "clip_ratio/low_min": 0.005017350427806377, "clip_ratio/region_mean": 0.008068430237472057, "completions/clipped_ratio": 0.0, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 1673.375, "completions/mean_terminated_length": 1673.375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.038616384379565716, "epoch": 2.6240001049600043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005704166367650032, "kl": 0.020041804178617895, "learning_rate": 7.4629799978764755e-06, "loss": 0.0002, "num_tokens": 15513079.0, "reward": 13.658288955688477, "reward_std": 28.35468864440918, "rewards/rollout_reward_func/mean": 13.658288955688477, "rewards/rollout_reward_func/std": 28.35468864440918, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.875, "sampling/sampling_logp_difference/mean": 0.2973865270614624, "step": 656, "step_time": 25.923072426994622 }, { "clip_ratio/high_max": 0.004529598067165352, "clip_ratio/high_mean": 0.004529598067165352, "clip_ratio/low_mean": 0.0034744938602671027, "clip_ratio/low_min": 0.0034744938602671027, "clip_ratio/region_mean": 0.008004092087503523, "completions/clipped_ratio": 0.0, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 1747.9375, "completions/mean_terminated_length": 1747.9375, "completions/min_length": 1628.0, "completions/min_terminated_length": 1628.0, "entropy": 0.034152280539274216, "epoch": 2.6280001051200043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01610158383846283, "kl": 0.02443368744570762, "learning_rate": 7.46297999786962e-06, "loss": 0.0002, "num_tokens": 15553953.0, "reward": -5.3762712478637695, "reward_std": 11.536758422851562, "rewards/rollout_reward_func/mean": -5.3762712478637695, "rewards/rollout_reward_func/std": 11.536759376525879, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.65625762939453, "sampling/sampling_logp_difference/mean": 0.2953740954399109, "step": 657, "step_time": 26.219660130998818 }, { "clip_ratio/high_max": 0.0038415679009631276, "clip_ratio/high_mean": 0.0038415679009631276, "clip_ratio/low_mean": 0.004574790946207941, "clip_ratio/low_min": 0.004574790946207941, "clip_ratio/region_mean": 0.008416358847171068, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 1698.0, "completions/mean_terminated_length": 1698.0, "completions/min_length": 1474.0, "completions/min_terminated_length": 1474.0, "entropy": 0.038761265110224485, "epoch": 2.632000105280004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035361412446945906, "kl": 0.019343013525940478, "learning_rate": 7.462979997862753e-06, "loss": 0.0002, "num_tokens": 15593996.0, "reward": -3.6318166255950928, "reward_std": 17.273000717163086, "rewards/rollout_reward_func/mean": -3.6318166255950928, "rewards/rollout_reward_func/std": 17.27300262451172, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.9375114440918, "sampling/sampling_logp_difference/mean": 0.2990087866783142, "step": 658, "step_time": 26.210125548001088 }, { "clip_ratio/high_max": 0.003886059217620641, "clip_ratio/high_mean": 0.003886059217620641, "clip_ratio/low_mean": 0.004364179214462638, "clip_ratio/low_min": 0.004364179214462638, "clip_ratio/region_mean": 0.008250238432083279, "completions/clipped_ratio": 0.0, "completions/max_length": 1869.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 1797.3125, "completions/mean_terminated_length": 1797.3125, "completions/min_length": 1696.0, "completions/min_terminated_length": 1696.0, "entropy": 0.03320428868755698, "epoch": 2.636000105440004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0041645849123597145, "kl": 0.018865706166252494, "learning_rate": 7.462979997855876e-06, "loss": 0.0002, "num_tokens": 15635695.0, "reward": -8.419122695922852, "reward_std": 6.674655437469482, "rewards/rollout_reward_func/mean": -8.419122695922852, "rewards/rollout_reward_func/std": 6.674655914306641, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.84770202636719, "sampling/sampling_logp_difference/mean": 0.276877224445343, "step": 659, "step_time": 26.093983960003243 }, { "clip_ratio/high_max": 0.004536445369012654, "clip_ratio/high_mean": 0.004536445369012654, "clip_ratio/low_mean": 0.0038440049393102527, "clip_ratio/low_min": 0.0038440049393102527, "clip_ratio/region_mean": 0.008380450308322906, "completions/clipped_ratio": 0.0, "completions/max_length": 1843.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 1759.5625, "completions/mean_terminated_length": 1759.5625, "completions/min_length": 1642.0, "completions/min_terminated_length": 1642.0, "entropy": 0.03633040655404329, "epoch": 2.6400001056000042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005856228992342949, "kl": 0.01875034614931792, "learning_rate": 7.462979997848987e-06, "loss": 0.0002, "num_tokens": 15676760.0, "reward": -5.689760208129883, "reward_std": 9.082539558410645, "rewards/rollout_reward_func/mean": -5.689760208129883, "rewards/rollout_reward_func/std": 9.082540512084961, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.203125, "sampling/sampling_logp_difference/mean": 0.2751113474369049, "step": 660, "step_time": 26.37352503900911 }, { "clip_ratio/high_max": 0.002925378270447254, "clip_ratio/high_mean": 0.002925378270447254, "clip_ratio/low_mean": 0.005670704762451351, "clip_ratio/low_min": 0.005670704762451351, "clip_ratio/region_mean": 0.008596082974690944, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1729.1875, "completions/mean_terminated_length": 1729.1875, "completions/min_length": 1469.0, "completions/min_terminated_length": 1469.0, "entropy": 0.036660210229456425, "epoch": 2.6440001057600043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032155835069715977, "kl": 0.020027526887133718, "learning_rate": 7.462979997842086e-06, "loss": 0.0002, "num_tokens": 15717347.0, "reward": -0.8371981382369995, "reward_std": 18.25175666809082, "rewards/rollout_reward_func/mean": -0.8371981382369995, "rewards/rollout_reward_func/std": 18.25175666809082, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.687503814697266, "sampling/sampling_logp_difference/mean": 0.2880297005176544, "step": 661, "step_time": 26.19927197900688 }, { "clip_ratio/high_max": 0.005444745271233842, "clip_ratio/high_mean": 0.005444745271233842, "clip_ratio/low_mean": 0.0028142210212536156, "clip_ratio/low_min": 0.0028142210212536156, "clip_ratio/region_mean": 0.008258966321591288, "completions/clipped_ratio": 0.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 1776.5, "completions/mean_terminated_length": 1776.5, "completions/min_length": 1658.0, "completions/min_terminated_length": 1658.0, "entropy": 0.035414361860603094, "epoch": 2.648000105920004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00456455210223794, "kl": 0.01726543006952852, "learning_rate": 7.462979997835176e-06, "loss": 0.0001, "num_tokens": 15758699.0, "reward": -12.166976928710938, "reward_std": 9.141226768493652, "rewards/rollout_reward_func/mean": -12.166976928710938, "rewards/rollout_reward_func/std": 9.141226768493652, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.0625, "sampling/sampling_logp_difference/mean": 0.2727993130683899, "step": 662, "step_time": 26.258648838993395 }, { "clip_ratio/high_max": 0.003032742824871093, "clip_ratio/high_mean": 0.003032742824871093, "clip_ratio/low_mean": 0.005809349939227104, "clip_ratio/low_min": 0.005809349939227104, "clip_ratio/region_mean": 0.008842092764098197, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 1623.5625, "completions/mean_terminated_length": 1623.5625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.03944932157173753, "epoch": 2.652000106080004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028072018176317215, "kl": 0.016857739305123687, "learning_rate": 7.462979997828254e-06, "loss": 0.0001, "num_tokens": 15797572.0, "reward": 2.612417221069336, "reward_std": 27.204782485961914, "rewards/rollout_reward_func/mean": 2.612417221069336, "rewards/rollout_reward_func/std": 27.204782485961914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.59375762939453, "sampling/sampling_logp_difference/mean": 0.2957402467727661, "step": 663, "step_time": 25.545906604013 }, { "clip_ratio/high_max": 0.00477127026533708, "clip_ratio/high_mean": 0.00477127026533708, "clip_ratio/low_mean": 0.003128142358036712, "clip_ratio/low_min": 0.003128142358036712, "clip_ratio/region_mean": 0.007899412594269961, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 1709.75, "completions/mean_terminated_length": 1709.75, "completions/min_length": 1619.0, "completions/min_terminated_length": 1619.0, "entropy": 0.03759249811992049, "epoch": 2.6560001062400042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002391538117080927, "kl": 0.017024509143084288, "learning_rate": 7.462979997821321e-06, "loss": 0.0001, "num_tokens": 15837816.0, "reward": -6.221149921417236, "reward_std": 9.687285423278809, "rewards/rollout_reward_func/mean": -6.221149921417236, "rewards/rollout_reward_func/std": 9.687286376953125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.64849853515625, "sampling/sampling_logp_difference/mean": 0.2803975045681, "step": 664, "step_time": 26.371656036004424 }, { "clip_ratio/high_max": 0.004029166419059038, "clip_ratio/high_mean": 0.004029166419059038, "clip_ratio/low_mean": 0.005155529273906723, "clip_ratio/low_min": 0.005155529273906723, "clip_ratio/region_mean": 0.00918469560565427, "completions/clipped_ratio": 0.0, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 1764.375, "completions/mean_terminated_length": 1764.375, "completions/min_length": 1431.0, "completions/min_terminated_length": 1431.0, "entropy": 0.03510359115898609, "epoch": 2.6600001064000043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00886363536119461, "kl": 0.019375333446078002, "learning_rate": 7.462979997814377e-06, "loss": 0.0002, "num_tokens": 15878979.0, "reward": -2.2411770820617676, "reward_std": 20.16114044189453, "rewards/rollout_reward_func/mean": -2.2411770820617676, "rewards/rollout_reward_func/std": 20.161142349243164, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.203125, "sampling/sampling_logp_difference/mean": 0.29994621872901917, "step": 665, "step_time": 26.400065381996683 }, { "clip_ratio/high_max": 0.00378568161977455, "clip_ratio/high_mean": 0.00378568161977455, "clip_ratio/low_mean": 0.004543166258372366, "clip_ratio/low_min": 0.004543166258372366, "clip_ratio/region_mean": 0.008328847878146917, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1711.625, "completions/mean_terminated_length": 1711.625, "completions/min_length": 1618.0, "completions/min_terminated_length": 1618.0, "entropy": 0.03657418116927147, "epoch": 2.6640001065600044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005151655059307814, "kl": 0.017419694806449115, "learning_rate": 7.462979997807422e-06, "loss": 0.0001, "num_tokens": 15919261.0, "reward": -5.223353862762451, "reward_std": 10.378220558166504, "rewards/rollout_reward_func/mean": -5.223353862762451, "rewards/rollout_reward_func/std": 10.37822151184082, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.08984375, "sampling/sampling_logp_difference/mean": 0.2898256182670593, "step": 666, "step_time": 25.931341971998336 }, { "clip_ratio/high_max": 0.003917160531273112, "clip_ratio/high_mean": 0.003917160531273112, "clip_ratio/low_mean": 0.004829726472962648, "clip_ratio/low_min": 0.004829726472962648, "clip_ratio/region_mean": 0.008746886916924268, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1705.5, "completions/mean_terminated_length": 1705.5, "completions/min_length": 1540.0, "completions/min_terminated_length": 1540.0, "entropy": 0.03632284188643098, "epoch": 2.668000106720004e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003992987331002951, "kl": 0.016606143675744534, "learning_rate": 7.462979997800456e-06, "loss": 0.0001, "num_tokens": 15959427.0, "reward": -0.7594025731086731, "reward_std": 15.997657775878906, "rewards/rollout_reward_func/mean": -0.7594025731086731, "rewards/rollout_reward_func/std": 15.997658729553223, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.859375, "sampling/sampling_logp_difference/mean": 0.29579177498817444, "step": 667, "step_time": 25.82378959000198 }, { "clip_ratio/high_max": 0.0032666308688931167, "clip_ratio/high_mean": 0.0032666308688931167, "clip_ratio/low_mean": 0.0043180701904930174, "clip_ratio/low_min": 0.0043180701904930174, "clip_ratio/region_mean": 0.007584701001178473, "completions/clipped_ratio": 0.0, "completions/max_length": 1873.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 1804.125, "completions/mean_terminated_length": 1804.125, "completions/min_length": 1736.0, "completions/min_terminated_length": 1736.0, "entropy": 0.03521374287083745, "epoch": 2.6720001068800042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.027066199108958244, "kl": 0.022963459021411836, "learning_rate": 7.462979997793479e-06, "loss": 0.0002, "num_tokens": 16001248.0, "reward": -1.5157766342163086, "reward_std": 14.016838073730469, "rewards/rollout_reward_func/mean": -1.5157766342163086, "rewards/rollout_reward_func/std": 14.016838073730469, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.59375, "sampling/sampling_logp_difference/mean": 0.2709789276123047, "step": 668, "step_time": 26.0326166649902 }, { "clip_ratio/high_max": 0.003927598096197471, "clip_ratio/high_mean": 0.003927598096197471, "clip_ratio/low_mean": 0.004662774270400405, "clip_ratio/low_min": 0.004662774270400405, "clip_ratio/region_mean": 0.008590372337494045, "completions/clipped_ratio": 0.0, "completions/max_length": 1833.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 1738.625, "completions/mean_terminated_length": 1738.625, "completions/min_length": 1470.0, "completions/min_terminated_length": 1470.0, "entropy": 0.036338281352072954, "epoch": 2.6760001070400043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01566155068576336, "kl": 0.02285916416440159, "learning_rate": 7.462979997786491e-06, "loss": 0.0002, "num_tokens": 16041978.0, "reward": 4.711921691894531, "reward_std": 30.304941177368164, "rewards/rollout_reward_func/mean": 4.711921691894531, "rewards/rollout_reward_func/std": 30.304941177368164, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.690948486328125, "sampling/sampling_logp_difference/mean": 0.29264530539512634, "step": 669, "step_time": 26.15070393800852 }, { "clip_ratio/high_max": 0.004007723677204922, "clip_ratio/high_mean": 0.004007723677204922, "clip_ratio/low_mean": 0.003950056649046019, "clip_ratio/low_min": 0.003950056649046019, "clip_ratio/region_mean": 0.00795778032625094, "completions/clipped_ratio": 0.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 1766.125, "completions/mean_terminated_length": 1766.125, "completions/min_length": 1635.0, "completions/min_terminated_length": 1635.0, "entropy": 0.03592226933687925, "epoch": 2.6800001072000044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012233809567987919, "kl": 0.020750620518811047, "learning_rate": 7.462979997779491e-06, "loss": 0.0002, "num_tokens": 16083161.0, "reward": -8.336812973022461, "reward_std": 6.303380966186523, "rewards/rollout_reward_func/mean": -8.336812973022461, "rewards/rollout_reward_func/std": 6.303381443023682, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.34419250488281, "sampling/sampling_logp_difference/mean": 0.27806466817855835, "step": 670, "step_time": 26.330916631988657 }, { "clip_ratio/high_max": 0.004433014226378873, "clip_ratio/high_mean": 0.004433014226378873, "clip_ratio/low_mean": 0.00397946746670641, "clip_ratio/low_min": 0.00397946746670641, "clip_ratio/region_mean": 0.008412481751292944, "completions/clipped_ratio": 0.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1783.0625, "completions/mean_terminated_length": 1783.0625, "completions/min_length": 1661.0, "completions/min_terminated_length": 1661.0, "entropy": 0.03535575373098254, "epoch": 2.6840001073600045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032964081037789583, "kl": 0.018022072850726545, "learning_rate": 7.462979997772482e-06, "loss": 0.0002, "num_tokens": 16124622.0, "reward": -9.704182624816895, "reward_std": 8.121076583862305, "rewards/rollout_reward_func/mean": -9.704182624816895, "rewards/rollout_reward_func/std": 8.121076583862305, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.125, "sampling/sampling_logp_difference/mean": 0.28411322832107544, "step": 671, "step_time": 26.248343913997815 }, { "clip_ratio/high_max": 0.003625501471105963, "clip_ratio/high_mean": 0.003625501471105963, "clip_ratio/low_mean": 0.004479589260881767, "clip_ratio/low_min": 0.004479589260881767, "clip_ratio/region_mean": 0.008105090644676238, "completions/clipped_ratio": 0.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 1660.1875, "completions/mean_terminated_length": 1660.1875, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "entropy": 0.03769947309046984, "epoch": 2.6880001075200042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029447684064507484, "kl": 0.017057000775821507, "learning_rate": 7.462979997765459e-06, "loss": 0.0001, "num_tokens": 16164038.0, "reward": 0.12643766403198242, "reward_std": 27.622194290161133, "rewards/rollout_reward_func/mean": 0.12643766403198242, "rewards/rollout_reward_func/std": 27.6221981048584, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.90625, "sampling/sampling_logp_difference/mean": 0.2872745990753174, "step": 672, "step_time": 26.1882895299932 }, { "clip_ratio/high_max": 0.0035517557698767632, "clip_ratio/high_mean": 0.0035517557698767632, "clip_ratio/low_mean": 0.005551190639380366, "clip_ratio/low_min": 0.005551190639380366, "clip_ratio/region_mean": 0.00910294649656862, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1677.6875, "completions/mean_terminated_length": 1677.6875, "completions/min_length": 1620.0, "completions/min_terminated_length": 1620.0, "entropy": 0.03739431593567133, "epoch": 2.6920001076800043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005498192273080349, "kl": 0.020014676498249173, "learning_rate": 7.462979997758428e-06, "loss": 0.0002, "num_tokens": 16203727.0, "reward": -5.061062812805176, "reward_std": 7.4538984298706055, "rewards/rollout_reward_func/mean": -5.061062812805176, "rewards/rollout_reward_func/std": 7.4538984298706055, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.242271423339844, "sampling/sampling_logp_difference/mean": 0.2930592894554138, "step": 673, "step_time": 25.531312508996052 }, { "clip_ratio/high_max": 0.004455610265722498, "clip_ratio/high_mean": 0.004455610265722498, "clip_ratio/low_mean": 0.0035678994026966393, "clip_ratio/low_min": 0.0035678994026966393, "clip_ratio/region_mean": 0.008023509755730629, "completions/clipped_ratio": 0.0, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 1613.1875, "completions/mean_terminated_length": 1613.1875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.03977016778662801, "epoch": 2.6960001078400044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002611737698316574, "kl": 0.016649776371195912, "learning_rate": 7.462979997751384e-06, "loss": 0.0001, "num_tokens": 16242422.0, "reward": 7.902669906616211, "reward_std": 35.89565658569336, "rewards/rollout_reward_func/mean": 7.902669906616211, "rewards/rollout_reward_func/std": 35.89565658569336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.218753814697266, "sampling/sampling_logp_difference/mean": 0.2873777151107788, "step": 674, "step_time": 25.107763303989486 }, { "clip_ratio/high_max": 0.003931797822588123, "clip_ratio/high_mean": 0.003931797822588123, "clip_ratio/low_mean": 0.0031964963272912428, "clip_ratio/low_min": 0.0031964963272912428, "clip_ratio/region_mean": 0.007128294120775536, "completions/clipped_ratio": 0.0, "completions/max_length": 1877.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 1768.5625, "completions/mean_terminated_length": 1768.5625, "completions/min_length": 1668.0, "completions/min_terminated_length": 1668.0, "entropy": 0.03543979488313198, "epoch": 2.7000001080000044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011766048148274422, "kl": 0.01628429431002587, "learning_rate": 7.46297999774433e-06, "loss": 0.0001, "num_tokens": 16283645.0, "reward": -7.8513946533203125, "reward_std": 9.7378568649292, "rewards/rollout_reward_func/mean": -7.8513946533203125, "rewards/rollout_reward_func/std": 9.737857818603516, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.46875, "sampling/sampling_logp_difference/mean": 0.2769578695297241, "step": 675, "step_time": 26.19215564001206 }, { "clip_ratio/high_max": 0.0032112195913214236, "clip_ratio/high_mean": 0.0032112195913214236, "clip_ratio/low_mean": 0.005164481408428401, "clip_ratio/low_min": 0.005164481408428401, "clip_ratio/region_mean": 0.008375700970645994, "completions/clipped_ratio": 0.0, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 1630.9375, "completions/mean_terminated_length": 1630.9375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.037988423835486174, "epoch": 2.7040001081600042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00254653743468225, "kl": 0.016085574636235833, "learning_rate": 7.462979997737265e-06, "loss": 0.0001, "num_tokens": 16322638.0, "reward": -0.0396425724029541, "reward_std": 27.65137481689453, "rewards/rollout_reward_func/mean": -0.0396425724029541, "rewards/rollout_reward_func/std": 27.65137481689453, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.5, "sampling/sampling_logp_difference/mean": 0.2823973000049591, "step": 676, "step_time": 25.537273637004546 }, { "clip_ratio/high_max": 0.005304518504999578, "clip_ratio/high_mean": 0.005304518504999578, "clip_ratio/low_mean": 0.0034713338973233476, "clip_ratio/low_min": 0.0034713338973233476, "clip_ratio/region_mean": 0.00877585232956335, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 1740.125, "completions/mean_terminated_length": 1740.125, "completions/min_length": 1611.0, "completions/min_terminated_length": 1611.0, "entropy": 0.03726454824209213, "epoch": 2.7080001083200043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006321463733911514, "kl": 0.01728371437638998, "learning_rate": 7.462979997730187e-06, "loss": 0.0001, "num_tokens": 16363374.0, "reward": -5.897495269775391, "reward_std": 9.53194808959961, "rewards/rollout_reward_func/mean": -5.897495269775391, "rewards/rollout_reward_func/std": 9.53194808959961, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.296875, "sampling/sampling_logp_difference/mean": 0.2745940089225769, "step": 677, "step_time": 26.032313715018972 }, { "clip_ratio/high_max": 0.005917823902564123, "clip_ratio/high_mean": 0.005917823902564123, "clip_ratio/low_mean": 0.00359563939855434, "clip_ratio/low_min": 0.00359563939855434, "clip_ratio/region_mean": 0.009513463359326124, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 1676.125, "completions/mean_terminated_length": 1676.125, "completions/min_length": 1625.0, "completions/min_terminated_length": 1625.0, "entropy": 0.03733931458555162, "epoch": 2.7120001084800043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008273166604340076, "kl": 0.018533152993768454, "learning_rate": 7.4629799977231e-06, "loss": 0.0002, "num_tokens": 16403033.0, "reward": -6.774015426635742, "reward_std": 7.380300521850586, "rewards/rollout_reward_func/mean": -6.774015426635742, "rewards/rollout_reward_func/std": 7.380300521850586, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.34375, "sampling/sampling_logp_difference/mean": 0.2864660322666168, "step": 678, "step_time": 25.351249623010517 }, { "clip_ratio/high_max": 0.003577514027711004, "clip_ratio/high_mean": 0.003577514027711004, "clip_ratio/low_mean": 0.005547372245928273, "clip_ratio/low_min": 0.005547372245928273, "clip_ratio/region_mean": 0.009124886128120124, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 1646.1875, "completions/mean_terminated_length": 1646.1875, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "entropy": 0.04131248965859413, "epoch": 2.7160001086400044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004342330619692802, "kl": 0.01960854255594313, "learning_rate": 7.462979997716001e-06, "loss": 0.0002, "num_tokens": 16442238.0, "reward": 2.4170002937316895, "reward_std": 30.010326385498047, "rewards/rollout_reward_func/mean": 2.4170002937316895, "rewards/rollout_reward_func/std": 30.01032829284668, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.5625, "sampling/sampling_logp_difference/mean": 0.3027668595314026, "step": 679, "step_time": 25.460150917002466 }, { "clip_ratio/high_max": 0.004728282307041809, "clip_ratio/high_mean": 0.004728282307041809, "clip_ratio/low_mean": 0.0035205790190957487, "clip_ratio/low_min": 0.0035205790190957487, "clip_ratio/region_mean": 0.008248861238826066, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1782.4375, "completions/mean_terminated_length": 1782.4375, "completions/min_length": 1700.0, "completions/min_terminated_length": 1700.0, "entropy": 0.03414291236549616, "epoch": 2.7200001088000045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027392113115638494, "kl": 0.01689849590184167, "learning_rate": 7.462979997708892e-06, "loss": 0.0001, "num_tokens": 16483688.0, "reward": -6.141778469085693, "reward_std": 9.040030479431152, "rewards/rollout_reward_func/mean": -6.141778469085693, "rewards/rollout_reward_func/std": 9.040030479431152, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.5625, "sampling/sampling_logp_difference/mean": 0.27542644739151, "step": 680, "step_time": 26.003648482001154 }, { "clip_ratio/high_max": 0.005078767484519631, "clip_ratio/high_mean": 0.005078767484519631, "clip_ratio/low_mean": 0.003572047542547807, "clip_ratio/low_min": 0.003572047542547807, "clip_ratio/region_mean": 0.008650814939755946, "completions/clipped_ratio": 0.0, "completions/max_length": 1844.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 1765.6875, "completions/mean_terminated_length": 1765.6875, "completions/min_length": 1636.0, "completions/min_terminated_length": 1636.0, "entropy": 0.03622563788667321, "epoch": 2.7240001089600042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006819043308496475, "kl": 0.018090936122462153, "learning_rate": 7.46297999770177e-06, "loss": 0.0002, "num_tokens": 16524854.0, "reward": -7.792394638061523, "reward_std": 9.99161148071289, "rewards/rollout_reward_func/mean": -7.792394638061523, "rewards/rollout_reward_func/std": 9.99161148071289, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.0625, "sampling/sampling_logp_difference/mean": 0.2791416645050049, "step": 681, "step_time": 26.015325430984376 }, { "clip_ratio/high_max": 0.003985948656918481, "clip_ratio/high_mean": 0.003985948656918481, "clip_ratio/low_mean": 0.004417810487211682, "clip_ratio/low_min": 0.004417810487211682, "clip_ratio/region_mean": 0.008403759158682078, "completions/clipped_ratio": 0.0, "completions/max_length": 1750.0, "completions/max_terminated_length": 1750.0, "completions/mean_length": 1597.3125, "completions/mean_terminated_length": 1597.3125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.04351601283997297, "epoch": 2.7280001091200043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003961034119129181, "kl": 0.01862851600162685, "learning_rate": 7.462979997694639e-06, "loss": 0.0001, "num_tokens": 16563286.0, "reward": 5.871960639953613, "reward_std": 34.21082305908203, "rewards/rollout_reward_func/mean": 5.871960639953613, "rewards/rollout_reward_func/std": 34.21082305908203, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.40625, "sampling/sampling_logp_difference/mean": 0.3019711375236511, "step": 682, "step_time": 25.11984570700588 }, { "clip_ratio/high_max": 0.0061881109431851655, "clip_ratio/high_mean": 0.0061881109431851655, "clip_ratio/low_mean": 0.0033979889703914523, "clip_ratio/low_min": 0.0033979889703914523, "clip_ratio/region_mean": 0.009586099826265126, "completions/clipped_ratio": 0.0, "completions/max_length": 1855.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 1719.9375, "completions/mean_terminated_length": 1719.9375, "completions/min_length": 1648.0, "completions/min_terminated_length": 1648.0, "entropy": 0.03601082321256399, "epoch": 2.7320001092800044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033668233081698418, "kl": 0.017640132806263864, "learning_rate": 7.4629799976874965e-06, "loss": 0.0001, "num_tokens": 16603695.0, "reward": -7.674644470214844, "reward_std": 6.064459323883057, "rewards/rollout_reward_func/mean": -7.674644470214844, "rewards/rollout_reward_func/std": 6.064459800720215, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.8125, "sampling/sampling_logp_difference/mean": 0.2822436988353729, "step": 683, "step_time": 26.514715701006935 }, { "clip_ratio/high_max": 0.002735540736466646, "clip_ratio/high_mean": 0.002735540736466646, "clip_ratio/low_mean": 0.005474448116729036, "clip_ratio/low_min": 0.005474448116729036, "clip_ratio/region_mean": 0.008209988824091852, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1737.375, "completions/mean_terminated_length": 1737.375, "completions/min_length": 1649.0, "completions/min_terminated_length": 1649.0, "entropy": 0.03760718833655119, "epoch": 2.7360001094400045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028499492909759283, "kl": 0.01712925359606743, "learning_rate": 7.462979997680342e-06, "loss": 0.0001, "num_tokens": 16644392.0, "reward": -6.894408702850342, "reward_std": 13.289104461669922, "rewards/rollout_reward_func/mean": -6.894408702850342, "rewards/rollout_reward_func/std": 13.289104461669922, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.5, "sampling/sampling_logp_difference/mean": 0.2841295003890991, "step": 684, "step_time": 26.356027542999072 }, { "clip_ratio/high_max": 0.004781413488672115, "clip_ratio/high_mean": 0.004781413488672115, "clip_ratio/low_mean": 0.003309428022475913, "clip_ratio/low_min": 0.003309428022475913, "clip_ratio/region_mean": 0.008090841583907604, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 1759.75, "completions/mean_terminated_length": 1759.75, "completions/min_length": 1615.0, "completions/min_terminated_length": 1615.0, "entropy": 0.035462961765006185, "epoch": 2.7400001096000042e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.015068304724991322, "kl": 0.022000275552272797, "learning_rate": 7.462979997673177e-06, "loss": 0.0002, "num_tokens": 16685455.0, "reward": -6.9336628913879395, "reward_std": 9.24376106262207, "rewards/rollout_reward_func/mean": -6.9336628913879395, "rewards/rollout_reward_func/std": 9.243760108947754, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.96875, "sampling/sampling_logp_difference/mean": 0.27584129571914673, "step": 685, "step_time": 26.166303005986265 }, { "clip_ratio/high_max": 0.003861590346787125, "clip_ratio/high_mean": 0.003861590346787125, "clip_ratio/low_mean": 0.004964175779605284, "clip_ratio/low_min": 0.004964175779605284, "clip_ratio/region_mean": 0.008825765980873257, "completions/clipped_ratio": 0.0, "completions/max_length": 1828.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 1752.3125, "completions/mean_terminated_length": 1752.3125, "completions/min_length": 1662.0, "completions/min_terminated_length": 1662.0, "entropy": 0.034080683486536145, "epoch": 2.7440001097600043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027994203846901655, "kl": 0.015433944994583726, "learning_rate": 7.462979997666001e-06, "loss": 0.0001, "num_tokens": 16726402.0, "reward": -4.182677745819092, "reward_std": 11.915998458862305, "rewards/rollout_reward_func/mean": -4.182677745819092, "rewards/rollout_reward_func/std": 11.915998458862305, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.1796875, "sampling/sampling_logp_difference/mean": 0.28645241260528564, "step": 686, "step_time": 26.344369202000962 }, { "clip_ratio/high_max": 0.0038245526084210724, "clip_ratio/high_mean": 0.0038245526084210724, "clip_ratio/low_mean": 0.00507475042832084, "clip_ratio/low_min": 0.00507475042832084, "clip_ratio/region_mean": 0.008899303153157234, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 1704.0, "completions/mean_terminated_length": 1704.0, "completions/min_length": 1462.0, "completions/min_terminated_length": 1462.0, "entropy": 0.03889825800433755, "epoch": 2.7480001099200044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008049765601754189, "kl": 0.01919457211624831, "learning_rate": 7.4629799976588135e-06, "loss": 0.0002, "num_tokens": 16766569.0, "reward": -8.38805866241455, "reward_std": 13.034424781799316, "rewards/rollout_reward_func/mean": -8.38805866241455, "rewards/rollout_reward_func/std": 13.034423828125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.703125, "sampling/sampling_logp_difference/mean": 0.2858528196811676, "step": 687, "step_time": 25.750122666999232 }, { "clip_ratio/high_max": 0.004785444121807814, "clip_ratio/high_mean": 0.004785444121807814, "clip_ratio/low_mean": 0.0041716402338352054, "clip_ratio/low_min": 0.0041716402338352054, "clip_ratio/region_mean": 0.008957084326539189, "completions/clipped_ratio": 0.0, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 1744.1875, "completions/mean_terminated_length": 1744.1875, "completions/min_length": 1411.0, "completions/min_terminated_length": 1411.0, "entropy": 0.03943176195025444, "epoch": 2.7520001100800045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004707835614681244, "kl": 0.016747135552577674, "learning_rate": 7.462979997651615e-06, "loss": 0.0001, "num_tokens": 16807384.0, "reward": 0.1127631664276123, "reward_std": 16.841794967651367, "rewards/rollout_reward_func/mean": 0.1127631664276123, "rewards/rollout_reward_func/std": 16.841796875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.95313262939453, "sampling/sampling_logp_difference/mean": 0.2923858165740967, "step": 688, "step_time": 26.040198351009167 }, { "clip_ratio/high_max": 0.004485530313104391, "clip_ratio/high_mean": 0.004485530313104391, "clip_ratio/low_mean": 0.004196162422886118, "clip_ratio/low_min": 0.004196162422886118, "clip_ratio/region_mean": 0.008681692823302, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 1721.3125, "completions/mean_terminated_length": 1721.3125, "completions/min_length": 1594.0, "completions/min_terminated_length": 1594.0, "entropy": 0.03736831806600094, "epoch": 2.7560001102400046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024406956508755684, "kl": 0.017248244141228497, "learning_rate": 7.462979997644406e-06, "loss": 0.0001, "num_tokens": 16847818.0, "reward": -3.8419458866119385, "reward_std": 6.795658588409424, "rewards/rollout_reward_func/mean": -3.8419458866119385, "rewards/rollout_reward_func/std": 6.795659065246582, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.03125, "sampling/sampling_logp_difference/mean": 0.28607043623924255, "step": 689, "step_time": 26.210866783010715 }, { "clip_ratio/high_max": 0.0038811850536148995, "clip_ratio/high_mean": 0.0038811850536148995, "clip_ratio/low_mean": 0.004998485121177509, "clip_ratio/low_min": 0.004998485121177509, "clip_ratio/region_mean": 0.00887967023300007, "completions/clipped_ratio": 0.0, "completions/max_length": 1850.0, "completions/max_terminated_length": 1850.0, "completions/mean_length": 1725.5625, "completions/mean_terminated_length": 1725.5625, "completions/min_length": 1649.0, "completions/min_terminated_length": 1649.0, "entropy": 0.0359183638356626, "epoch": 2.7600001104000043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007270915433764458, "kl": 0.017826712457463145, "learning_rate": 7.462979997637187e-06, "loss": 0.0002, "num_tokens": 16888314.0, "reward": -6.9817328453063965, "reward_std": 6.116753578186035, "rewards/rollout_reward_func/mean": -6.9817328453063965, "rewards/rollout_reward_func/std": 6.116753101348877, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.9375, "sampling/sampling_logp_difference/mean": 0.286639004945755, "step": 690, "step_time": 26.300802269994165 }, { "clip_ratio/high_max": 0.004736144852358848, "clip_ratio/high_mean": 0.004736144852358848, "clip_ratio/low_mean": 0.005092929844977334, "clip_ratio/low_min": 0.005092929844977334, "clip_ratio/region_mean": 0.009829074726440012, "completions/clipped_ratio": 0.0, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 1735.875, "completions/mean_terminated_length": 1735.875, "completions/min_length": 1608.0, "completions/min_terminated_length": 1608.0, "entropy": 0.03639492625370622, "epoch": 2.7640001105600044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033326195552945137, "kl": 0.016307767131365836, "learning_rate": 7.462979997629955e-06, "loss": 0.0001, "num_tokens": 16928977.0, "reward": -3.340097427368164, "reward_std": 5.455414295196533, "rewards/rollout_reward_func/mean": -3.340097427368164, "rewards/rollout_reward_func/std": 5.455414295196533, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.25, "sampling/sampling_logp_difference/mean": 0.290692538022995, "step": 691, "step_time": 26.07407844800764 }, { "clip_ratio/high_max": 0.0054763877706136554, "clip_ratio/high_mean": 0.0054763877706136554, "clip_ratio/low_mean": 0.004257397959008813, "clip_ratio/low_min": 0.004257397959008813, "clip_ratio/region_mean": 0.00973378581693396, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1701.8125, "completions/mean_terminated_length": 1701.8125, "completions/min_length": 1621.0, "completions/min_terminated_length": 1621.0, "entropy": 0.03757258364930749, "epoch": 2.7680001107200045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003248698078095913, "kl": 0.017815021332353354, "learning_rate": 7.462979997622713e-06, "loss": 0.0001, "num_tokens": 16969088.0, "reward": -7.728740692138672, "reward_std": 6.7594895362854, "rewards/rollout_reward_func/mean": -7.728740692138672, "rewards/rollout_reward_func/std": 6.7594895362854, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.421875, "sampling/sampling_logp_difference/mean": 0.275755912065506, "step": 692, "step_time": 25.614413659008278 }, { "clip_ratio/high_max": 0.005931324267294258, "clip_ratio/high_mean": 0.005931324267294258, "clip_ratio/low_mean": 0.0033115522819571197, "clip_ratio/low_min": 0.0033115522819571197, "clip_ratio/region_mean": 0.009242876432836056, "completions/clipped_ratio": 0.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 1792.625, "completions/mean_terminated_length": 1792.625, "completions/min_length": 1727.0, "completions/min_terminated_length": 1727.0, "entropy": 0.03465356212109327, "epoch": 2.7720001108800045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01132353488355875, "kl": 0.01921924715861678, "learning_rate": 7.46297999761546e-06, "loss": 0.0002, "num_tokens": 17010714.0, "reward": -10.078598022460938, "reward_std": 11.82994270324707, "rewards/rollout_reward_func/mean": -10.078598022460938, "rewards/rollout_reward_func/std": 11.82994270324707, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.21875, "sampling/sampling_logp_difference/mean": 0.27807798981666565, "step": 693, "step_time": 26.00695588499366 }, { "clip_ratio/high_max": 0.0049367109895683825, "clip_ratio/high_mean": 0.0049367109895683825, "clip_ratio/low_mean": 0.0038658115954604, "clip_ratio/low_min": 0.0038658115954604, "clip_ratio/region_mean": 0.008802522555924952, "completions/clipped_ratio": 0.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 1733.9375, "completions/mean_terminated_length": 1733.9375, "completions/min_length": 1637.0, "completions/min_terminated_length": 1637.0, "entropy": 0.03654080489650369, "epoch": 2.7760001110400043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007469845935702324, "kl": 0.016299029579386115, "learning_rate": 7.462979997608196e-06, "loss": 0.0001, "num_tokens": 17051367.0, "reward": -6.322315692901611, "reward_std": 10.546448707580566, "rewards/rollout_reward_func/mean": -6.322315692901611, "rewards/rollout_reward_func/std": 10.546449661254883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.34375, "sampling/sampling_logp_difference/mean": 0.2890617549419403, "step": 694, "step_time": 26.51256977999583 }, { "clip_ratio/high_max": 0.002702715602936223, "clip_ratio/high_mean": 0.002702715602936223, "clip_ratio/low_mean": 0.0060425457486417145, "clip_ratio/low_min": 0.0060425457486417145, "clip_ratio/region_mean": 0.008745261293370277, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1654.625, "completions/mean_terminated_length": 1654.625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.03747056005522609, "epoch": 2.7800001112000044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036766782868653536, "kl": 0.013905706582590938, "learning_rate": 7.46297999760092e-06, "loss": 0.0001, "num_tokens": 17090764.0, "reward": 1.704268217086792, "reward_std": 27.881946563720703, "rewards/rollout_reward_func/mean": 1.704268217086792, "rewards/rollout_reward_func/std": 27.881948471069336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.6875, "sampling/sampling_logp_difference/mean": 0.29298558831214905, "step": 695, "step_time": 25.259146110001893 }, { "clip_ratio/high_max": 0.0033602955227252096, "clip_ratio/high_mean": 0.0033602955227252096, "clip_ratio/low_mean": 0.005005645740311593, "clip_ratio/low_min": 0.005005645740311593, "clip_ratio/region_mean": 0.008365941233932972, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1747.1875, "completions/mean_terminated_length": 1747.1875, "completions/min_length": 1418.0, "completions/min_terminated_length": 1418.0, "entropy": 0.03614582307636738, "epoch": 2.7840001113600044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0058920495212078094, "kl": 0.016385919181630015, "learning_rate": 7.4629799975936334e-06, "loss": 0.0001, "num_tokens": 17131632.0, "reward": -1.441832423210144, "reward_std": 20.897916793823242, "rewards/rollout_reward_func/mean": -1.441832423210144, "rewards/rollout_reward_func/std": 20.897920608520508, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.125, "sampling/sampling_logp_difference/mean": 0.29845917224884033, "step": 696, "step_time": 26.164863704005256 }, { "clip_ratio/high_max": 0.005330827902071178, "clip_ratio/high_mean": 0.005330827902071178, "clip_ratio/low_mean": 0.003659392532426864, "clip_ratio/low_min": 0.003659392532426864, "clip_ratio/region_mean": 0.008990220492705703, "completions/clipped_ratio": 0.0, "completions/max_length": 1839.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 1724.1875, "completions/mean_terminated_length": 1724.1875, "completions/min_length": 1632.0, "completions/min_terminated_length": 1632.0, "entropy": 0.03658610465936363, "epoch": 2.7880001115200045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004780895542353392, "kl": 0.017091812333092093, "learning_rate": 7.462979997586335e-06, "loss": 0.0001, "num_tokens": 17172121.0, "reward": -0.32833242416381836, "reward_std": 9.14657211303711, "rewards/rollout_reward_func/mean": -0.32833242416381836, "rewards/rollout_reward_func/std": 9.14657211303711, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.15625, "sampling/sampling_logp_difference/mean": 0.29403242468833923, "step": 697, "step_time": 26.345014814993192 }, { "clip_ratio/high_max": 0.005055132816778496, "clip_ratio/high_mean": 0.005055132816778496, "clip_ratio/low_mean": 0.0031549520790576935, "clip_ratio/low_min": 0.0031549520790576935, "clip_ratio/region_mean": 0.008210084866732359, "completions/clipped_ratio": 0.0, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 1759.125, "completions/mean_terminated_length": 1759.125, "completions/min_length": 1707.0, "completions/min_terminated_length": 1707.0, "entropy": 0.03526385361328721, "epoch": 2.7920001116800046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030485137831419706, "kl": 0.016001854208298028, "learning_rate": 7.462979997579027e-06, "loss": 0.0001, "num_tokens": 17213174.0, "reward": -4.350801467895508, "reward_std": 10.347742080688477, "rewards/rollout_reward_func/mean": -4.350801467895508, "rewards/rollout_reward_func/std": 10.347742080688477, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.38282012939453, "sampling/sampling_logp_difference/mean": 0.28926631808280945, "step": 698, "step_time": 26.266788736997114 }, { "clip_ratio/high_max": 0.004030425858218223, "clip_ratio/high_mean": 0.004030425858218223, "clip_ratio/low_mean": 0.004653142241295427, "clip_ratio/low_min": 0.004653142241295427, "clip_ratio/region_mean": 0.008683568041305989, "completions/clipped_ratio": 0.0, "completions/max_length": 1878.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 1764.0, "completions/mean_terminated_length": 1764.0, "completions/min_length": 1649.0, "completions/min_terminated_length": 1649.0, "entropy": 0.03620460187084973, "epoch": 2.7960001118400043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025653657503426075, "kl": 0.013520702603273094, "learning_rate": 7.462979997571707e-06, "loss": 0.0001, "num_tokens": 17254314.0, "reward": -4.063343048095703, "reward_std": 8.397793769836426, "rewards/rollout_reward_func/mean": -4.063343048095703, "rewards/rollout_reward_func/std": 8.397793769836426, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.5, "sampling/sampling_logp_difference/mean": 0.28074541687965393, "step": 699, "step_time": 26.45811134300311 }, { "clip_ratio/high_max": 0.00560558756114915, "clip_ratio/high_mean": 0.00560558756114915, "clip_ratio/low_mean": 0.003151353681460023, "clip_ratio/low_min": 0.003151353681460023, "clip_ratio/region_mean": 0.00875694106798619, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 1727.9375, "completions/mean_terminated_length": 1727.9375, "completions/min_length": 1635.0, "completions/min_terminated_length": 1635.0, "entropy": 0.036716885631904006, "epoch": 2.8000001120000044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.18324187397956848, "kl": 0.05389723915141076, "learning_rate": 7.462979997564377e-06, "loss": 0.0005, "num_tokens": 17294858.0, "reward": -5.648506164550781, "reward_std": 6.3336992263793945, "rewards/rollout_reward_func/mean": -5.648506164550781, "rewards/rollout_reward_func/std": 6.333699703216553, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.6875, "sampling/sampling_logp_difference/mean": 0.29165297746658325, "step": 700, "step_time": 26.23234051799227 }, { "clip_ratio/high_max": 0.003733556834049523, "clip_ratio/high_mean": 0.003733556834049523, "clip_ratio/low_mean": 0.0037053927371744066, "clip_ratio/low_min": 0.0037053927371744066, "clip_ratio/region_mean": 0.00743894960032776, "completions/clipped_ratio": 0.0, "completions/max_length": 1755.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 1689.625, "completions/mean_terminated_length": 1689.625, "completions/min_length": 1608.0, "completions/min_terminated_length": 1608.0, "entropy": 0.03725800197571516, "epoch": 2.8040001121600045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025287640746682882, "kl": 0.01568790222518146, "learning_rate": 7.462979997557036e-06, "loss": 0.0001, "num_tokens": 17334768.0, "reward": -5.638495445251465, "reward_std": 5.657009601593018, "rewards/rollout_reward_func/mean": -5.638495445251465, "rewards/rollout_reward_func/std": 5.657010078430176, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.64863586425781, "sampling/sampling_logp_difference/mean": 0.2862299978733063, "step": 701, "step_time": 25.621037541997794 }, { "clip_ratio/high_max": 0.004494867171160877, "clip_ratio/high_mean": 0.004494867171160877, "clip_ratio/low_mean": 0.004171730179223232, "clip_ratio/low_min": 0.004171730179223232, "clip_ratio/region_mean": 0.008666597364936024, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 1688.625, "completions/mean_terminated_length": 1688.625, "completions/min_length": 1611.0, "completions/min_terminated_length": 1611.0, "entropy": 0.04039548709988594, "epoch": 2.8080001123200046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010303214192390442, "kl": 0.017083141836337745, "learning_rate": 7.462979997549682e-06, "loss": 0.0001, "num_tokens": 17374662.0, "reward": -2.9917678833007812, "reward_std": 9.182687759399414, "rewards/rollout_reward_func/mean": -2.9917678833007812, "rewards/rollout_reward_func/std": 9.182687759399414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.96875, "sampling/sampling_logp_difference/mean": 0.28929603099823, "step": 702, "step_time": 25.687767691000772 }, { "clip_ratio/high_max": 0.004276641644537449, "clip_ratio/high_mean": 0.004276641644537449, "clip_ratio/low_mean": 0.004582961759297177, "clip_ratio/low_min": 0.004582961759297177, "clip_ratio/region_mean": 0.008859603374730796, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1749.0, "completions/mean_terminated_length": 1749.0, "completions/min_length": 1645.0, "completions/min_terminated_length": 1645.0, "entropy": 0.034986002603545785, "epoch": 2.8120001124800043e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005285011138767004, "kl": 0.01660403097048402, "learning_rate": 7.4629799975423185e-06, "loss": 0.0001, "num_tokens": 17415563.0, "reward": -5.372383117675781, "reward_std": 10.14961051940918, "rewards/rollout_reward_func/mean": -5.372383117675781, "rewards/rollout_reward_func/std": 10.14961051940918, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.25390625, "sampling/sampling_logp_difference/mean": 0.2815955877304077, "step": 703, "step_time": 26.060200027000974 }, { "clip_ratio/high_max": 0.004200365481665358, "clip_ratio/high_mean": 0.004200365481665358, "clip_ratio/low_mean": 0.004821952519705519, "clip_ratio/low_min": 0.004821952519705519, "clip_ratio/region_mean": 0.009022318001370877, "completions/clipped_ratio": 0.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 1739.8125, "completions/mean_terminated_length": 1739.8125, "completions/min_length": 1631.0, "completions/min_terminated_length": 1631.0, "entropy": 0.03591364622116089, "epoch": 2.8160001126400044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003164435736835003, "kl": 0.015364939346909523, "learning_rate": 7.4629799975349425e-06, "loss": 0.0001, "num_tokens": 17456306.0, "reward": -2.7570948600769043, "reward_std": 14.229338645935059, "rewards/rollout_reward_func/mean": -2.7570948600769043, "rewards/rollout_reward_func/std": 14.229338645935059, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.125, "sampling/sampling_logp_difference/mean": 0.2848498523235321, "step": 704, "step_time": 26.424943166006415 }, { "clip_ratio/high_max": 0.004050119721796364, "clip_ratio/high_mean": 0.004050119721796364, "clip_ratio/low_mean": 0.003592879220377654, "clip_ratio/low_min": 0.003592879220377654, "clip_ratio/region_mean": 0.007642999000381678, "completions/clipped_ratio": 0.0, "completions/max_length": 1884.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 1761.375, "completions/mean_terminated_length": 1761.375, "completions/min_length": 1616.0, "completions/min_terminated_length": 1616.0, "entropy": 0.034034299198538065, "epoch": 2.8200001128000045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008111115545034409, "kl": 0.013707077829167247, "learning_rate": 7.462979997527558e-06, "loss": 0.0001, "num_tokens": 17497404.0, "reward": -0.6888279914855957, "reward_std": 7.422823429107666, "rewards/rollout_reward_func/mean": -0.6888279914855957, "rewards/rollout_reward_func/std": 7.422823429107666, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.625, "sampling/sampling_logp_difference/mean": 0.2784445285797119, "step": 705, "step_time": 26.715217409000616 }, { "clip_ratio/high_max": 0.0036675323790404946, "clip_ratio/high_mean": 0.0036675323790404946, "clip_ratio/low_mean": 0.00556004056124948, "clip_ratio/low_min": 0.00556004056124948, "clip_ratio/region_mean": 0.009227572940289974, "completions/clipped_ratio": 0.0, "completions/max_length": 1849.0, "completions/max_terminated_length": 1849.0, "completions/mean_length": 1758.8125, "completions/mean_terminated_length": 1758.8125, "completions/min_length": 1636.0, "completions/min_terminated_length": 1636.0, "entropy": 0.034985841484740376, "epoch": 2.8240001129600046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026001124642789364, "kl": 0.014955706777982414, "learning_rate": 7.462979997520161e-06, "loss": 0.0001, "num_tokens": 17538466.0, "reward": -5.967660903930664, "reward_std": 8.531668663024902, "rewards/rollout_reward_func/mean": -5.967660903930664, "rewards/rollout_reward_func/std": 8.531668663024902, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.078147888183594, "sampling/sampling_logp_difference/mean": 0.28144389390945435, "step": 706, "step_time": 26.568785244991886 }, { "clip_ratio/high_max": 0.0050896944012492895, "clip_ratio/high_mean": 0.0050896944012492895, "clip_ratio/low_mean": 0.0037910049431957304, "clip_ratio/low_min": 0.0037910049431957304, "clip_ratio/region_mean": 0.00888069934444502, "completions/clipped_ratio": 0.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1749.5, "completions/mean_terminated_length": 1749.5, "completions/min_length": 1499.0, "completions/min_terminated_length": 1499.0, "entropy": 0.03668242134153843, "epoch": 2.8280001131200047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002234639599919319, "kl": 0.01495248218998313, "learning_rate": 7.462979997512752e-06, "loss": 0.0001, "num_tokens": 17579368.0, "reward": -3.6119823455810547, "reward_std": 16.90263557434082, "rewards/rollout_reward_func/mean": -3.6119823455810547, "rewards/rollout_reward_func/std": 16.90263557434082, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.4375, "sampling/sampling_logp_difference/mean": 0.29206976294517517, "step": 707, "step_time": 26.312764830996457 }, { "clip_ratio/high_max": 0.005321014381479472, "clip_ratio/high_mean": 0.005321014381479472, "clip_ratio/low_mean": 0.003656532848253846, "clip_ratio/low_min": 0.003656532848253846, "clip_ratio/region_mean": 0.008977547229733318, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 1604.5625, "completions/mean_terminated_length": 1604.5625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.03916760813444853, "epoch": 2.8320001132800044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005604796577244997, "kl": 0.0173538604285568, "learning_rate": 7.462979997505334e-06, "loss": 0.0001, "num_tokens": 17617919.0, "reward": 0.7150709629058838, "reward_std": 25.460460662841797, "rewards/rollout_reward_func/mean": 0.7150709629058838, "rewards/rollout_reward_func/std": 25.460460662841797, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.171875, "sampling/sampling_logp_difference/mean": 0.29492032527923584, "step": 708, "step_time": 29.586737301993708 }, { "clip_ratio/high_max": 0.002723493002122268, "clip_ratio/high_mean": 0.002723493002122268, "clip_ratio/low_mean": 0.00572101995931007, "clip_ratio/low_min": 0.00572101995931007, "clip_ratio/region_mean": 0.008444512961432338, "completions/clipped_ratio": 0.0, "completions/max_length": 1866.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 1633.9375, "completions/mean_terminated_length": 1633.9375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.03706885688006878, "epoch": 2.8360001134400045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.06757739186286926, "kl": 0.03058292856439948, "learning_rate": 7.4629799974979035e-06, "loss": 0.0003, "num_tokens": 17656954.0, "reward": 1.790281891822815, "reward_std": 33.43865203857422, "rewards/rollout_reward_func/mean": 1.790281891822815, "rewards/rollout_reward_func/std": 33.43865203857422, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.140625, "sampling/sampling_logp_difference/mean": 0.2901895344257355, "step": 709, "step_time": 25.716190426996036 }, { "clip_ratio/high_max": 0.0048227651859633625, "clip_ratio/high_mean": 0.0048227651859633625, "clip_ratio/low_mean": 0.0038927650603000075, "clip_ratio/low_min": 0.0038927650603000075, "clip_ratio/region_mean": 0.0087155302753672, "completions/clipped_ratio": 0.0, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 1719.5, "completions/mean_terminated_length": 1719.5, "completions/min_length": 1628.0, "completions/min_terminated_length": 1628.0, "entropy": 0.036960648372769356, "epoch": 2.8400001136000046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007690368220210075, "kl": 0.019205139949917793, "learning_rate": 7.462979997490462e-06, "loss": 0.0002, "num_tokens": 17697349.0, "reward": -6.026041030883789, "reward_std": 7.646664142608643, "rewards/rollout_reward_func/mean": -6.026041030883789, "rewards/rollout_reward_func/std": 7.646664142608643, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.5625, "sampling/sampling_logp_difference/mean": 0.28614649176597595, "step": 710, "step_time": 26.314058411000588 }, { "clip_ratio/high_max": 0.006452766509028152, "clip_ratio/high_mean": 0.006452766509028152, "clip_ratio/low_mean": 0.0027949782670475543, "clip_ratio/low_min": 0.0027949782670475543, "clip_ratio/region_mean": 0.009247744688764215, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 1695.6875, "completions/mean_terminated_length": 1695.6875, "completions/min_length": 1595.0, "completions/min_terminated_length": 1595.0, "entropy": 0.03807407012209296, "epoch": 2.8440001137600046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004121293779462576, "kl": 0.014305102289654315, "learning_rate": 7.46297999748301e-06, "loss": 0.0001, "num_tokens": 17737356.0, "reward": -6.684260368347168, "reward_std": 8.76071548461914, "rewards/rollout_reward_func/mean": -6.684260368347168, "rewards/rollout_reward_func/std": 8.76071548461914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.60940170288086, "sampling/sampling_logp_difference/mean": 0.2820724844932556, "step": 711, "step_time": 26.21892978300457 }, { "clip_ratio/high_max": 0.003726881666807458, "clip_ratio/high_mean": 0.003726881666807458, "clip_ratio/low_mean": 0.0036581082677002996, "clip_ratio/low_min": 0.0036581082677002996, "clip_ratio/region_mean": 0.007384989992715418, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 1532.0625, "completions/mean_terminated_length": 1532.0625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.03821260901167989, "epoch": 2.8480001139200047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005507213529199362, "kl": 0.01613979961257428, "learning_rate": 7.462979997475547e-06, "loss": 0.0001, "num_tokens": 17774771.0, "reward": 4.516274452209473, "reward_std": 32.53440475463867, "rewards/rollout_reward_func/mean": 4.516274452209473, "rewards/rollout_reward_func/std": 32.53440856933594, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.5625, "sampling/sampling_logp_difference/mean": 0.2963443100452423, "step": 712, "step_time": 24.985365427011857 }, { "clip_ratio/high_max": 0.0035747848742175847, "clip_ratio/high_mean": 0.0035747848742175847, "clip_ratio/low_mean": 0.005207001726375893, "clip_ratio/low_min": 0.005207001726375893, "clip_ratio/region_mean": 0.008781786600593477, "completions/clipped_ratio": 0.0, "completions/max_length": 1864.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1625.8125, "completions/mean_terminated_length": 1625.8125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.03881912352517247, "epoch": 2.8520001140800045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004570795223116875, "kl": 0.01584895176347345, "learning_rate": 7.462979997468073e-06, "loss": 0.0001, "num_tokens": 17813681.0, "reward": -1.110954761505127, "reward_std": 25.94501304626465, "rewards/rollout_reward_func/mean": -1.110954761505127, "rewards/rollout_reward_func/std": 25.94501495361328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.875, "sampling/sampling_logp_difference/mean": 0.28713586926460266, "step": 713, "step_time": 25.653579844991327 }, { "clip_ratio/high_max": 0.005398391804192215, "clip_ratio/high_mean": 0.005398391804192215, "clip_ratio/low_mean": 0.002669877780135721, "clip_ratio/low_min": 0.002669877780135721, "clip_ratio/region_mean": 0.008068269584327936, "completions/clipped_ratio": 0.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 1767.4375, "completions/mean_terminated_length": 1767.4375, "completions/min_length": 1640.0, "completions/min_terminated_length": 1640.0, "entropy": 0.035518016666173935, "epoch": 2.8560001142400045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002467378042638302, "kl": 0.01351329276803881, "learning_rate": 7.462979997460587e-06, "loss": 0.0001, "num_tokens": 17854886.0, "reward": -7.0735602378845215, "reward_std": 11.161581039428711, "rewards/rollout_reward_func/mean": -7.0735602378845215, "rewards/rollout_reward_func/std": 11.161581993103027, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.281253814697266, "sampling/sampling_logp_difference/mean": 0.2737146317958832, "step": 714, "step_time": 26.517093766015023 }, { "clip_ratio/high_max": 0.004244705924065784, "clip_ratio/high_mean": 0.004244705924065784, "clip_ratio/low_mean": 0.004351138835772872, "clip_ratio/low_min": 0.004351138835772872, "clip_ratio/region_mean": 0.008595844788942486, "completions/clipped_ratio": 0.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 1700.6875, "completions/mean_terminated_length": 1700.6875, "completions/min_length": 1642.0, "completions/min_terminated_length": 1642.0, "entropy": 0.03699561767280102, "epoch": 2.8600001144000046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.014959493651986122, "kl": 0.016636019223369658, "learning_rate": 7.46297999745309e-06, "loss": 0.0001, "num_tokens": 17894971.0, "reward": -2.0769166946411133, "reward_std": 12.046476364135742, "rewards/rollout_reward_func/mean": -2.0769166946411133, "rewards/rollout_reward_func/std": 12.046477317810059, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.484378814697266, "sampling/sampling_logp_difference/mean": 0.30028966069221497, "step": 715, "step_time": 25.643838390999008 }, { "clip_ratio/high_max": 0.002986903360579163, "clip_ratio/high_mean": 0.002986903360579163, "clip_ratio/low_mean": 0.005247306515229866, "clip_ratio/low_min": 0.005247306515229866, "clip_ratio/region_mean": 0.008234209846705198, "completions/clipped_ratio": 0.0, "completions/max_length": 1852.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 1768.8125, "completions/mean_terminated_length": 1768.8125, "completions/min_length": 1652.0, "completions/min_terminated_length": 1652.0, "entropy": 0.03646427346393466, "epoch": 2.8640001145600047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005754503421485424, "kl": 0.0158776551252231, "learning_rate": 7.462979997445583e-06, "loss": 0.0001, "num_tokens": 17936199.0, "reward": -7.364344596862793, "reward_std": 8.379446983337402, "rewards/rollout_reward_func/mean": -7.364344596862793, "rewards/rollout_reward_func/std": 8.379447937011719, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.8125, "sampling/sampling_logp_difference/mean": 0.2821119725704193, "step": 716, "step_time": 26.276117280991457 }, { "clip_ratio/high_max": 0.005346951482351869, "clip_ratio/high_mean": 0.005346951482351869, "clip_ratio/low_mean": 0.0036583376640919596, "clip_ratio/low_min": 0.0036583376640919596, "clip_ratio/region_mean": 0.009005289059132338, "completions/clipped_ratio": 0.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 1722.625, "completions/mean_terminated_length": 1722.625, "completions/min_length": 1623.0, "completions/min_terminated_length": 1623.0, "entropy": 0.03632170846685767, "epoch": 2.8680001147200044e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003888615407049656, "kl": 0.015678404946811497, "learning_rate": 7.462979997438065e-06, "loss": 0.0001, "num_tokens": 17976658.0, "reward": -3.6587166786193848, "reward_std": 16.400880813598633, "rewards/rollout_reward_func/mean": -3.6587166786193848, "rewards/rollout_reward_func/std": 16.400880813598633, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.736045837402344, "sampling/sampling_logp_difference/mean": 0.2878382205963135, "step": 717, "step_time": 26.43039269700239 }, { "clip_ratio/high_max": 0.00384653641958721, "clip_ratio/high_mean": 0.00384653641958721, "clip_ratio/low_mean": 0.004585007292916998, "clip_ratio/low_min": 0.004585007292916998, "clip_ratio/region_mean": 0.008431543712504208, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1677.3125, "completions/mean_terminated_length": 1677.3125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.03804178163409233, "epoch": 2.8720001148800045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034267606679350138, "kl": 0.014105600654147565, "learning_rate": 7.462979997430535e-06, "loss": 0.0001, "num_tokens": 18016431.0, "reward": -0.3108210563659668, "reward_std": 27.421478271484375, "rewards/rollout_reward_func/mean": -0.3108210563659668, "rewards/rollout_reward_func/std": 27.421480178833008, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.640625, "sampling/sampling_logp_difference/mean": 0.2819867432117462, "step": 718, "step_time": 25.69720179800788 }, { "clip_ratio/high_max": 0.004818375280592591, "clip_ratio/high_mean": 0.004818375280592591, "clip_ratio/low_mean": 0.004197687201667577, "clip_ratio/low_min": 0.004197687201667577, "clip_ratio/region_mean": 0.009016062482260168, "completions/clipped_ratio": 0.0, "completions/max_length": 1848.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 1747.1875, "completions/mean_terminated_length": 1747.1875, "completions/min_length": 1677.0, "completions/min_terminated_length": 1677.0, "entropy": 0.035173177253454924, "epoch": 2.8760001150400046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0044576688669621944, "kl": 0.0181316789239645, "learning_rate": 7.462979997422994e-06, "loss": 0.0002, "num_tokens": 18057279.0, "reward": -8.76343059539795, "reward_std": 9.136987686157227, "rewards/rollout_reward_func/mean": -8.76343059539795, "rewards/rollout_reward_func/std": 9.136988639831543, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.875, "sampling/sampling_logp_difference/mean": 0.27941298484802246, "step": 719, "step_time": 26.346516213998257 }, { "clip_ratio/high_max": 0.004108055669348687, "clip_ratio/high_mean": 0.004108055669348687, "clip_ratio/low_mean": 0.004887476214207709, "clip_ratio/low_min": 0.004887476214207709, "clip_ratio/region_mean": 0.008995531883556396, "completions/clipped_ratio": 0.0, "completions/max_length": 1845.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 1724.0, "completions/mean_terminated_length": 1724.0, "completions/min_length": 1550.0, "completions/min_terminated_length": 1550.0, "entropy": 0.036654414143413305, "epoch": 2.8800001152000047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035285688936710358, "kl": 0.015302018378861248, "learning_rate": 7.462979997415443e-06, "loss": 0.0001, "num_tokens": 18097756.0, "reward": -4.320125579833984, "reward_std": 10.10438346862793, "rewards/rollout_reward_func/mean": -4.320125579833984, "rewards/rollout_reward_func/std": 10.10438346862793, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.03181076049805, "sampling/sampling_logp_difference/mean": 0.28480902314186096, "step": 720, "step_time": 26.497975141006464 }, { "clip_ratio/high_max": 0.003886964754201472, "clip_ratio/high_mean": 0.003886964754201472, "clip_ratio/low_mean": 0.004892360302619636, "clip_ratio/low_min": 0.004892360302619636, "clip_ratio/region_mean": 0.008779325115028769, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 1956.5, "completions/mean_terminated_length": 1956.5, "completions/min_length": 1856.0, "completions/min_terminated_length": 1856.0, "entropy": 0.03495317045599222, "epoch": 2.8840001153600048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009256880730390549, "kl": 0.01714155077934265, "learning_rate": 7.46297999740788e-06, "loss": 0.0002, "num_tokens": 18141985.0, "reward": 0.30230194330215454, "reward_std": 15.167367935180664, "rewards/rollout_reward_func/mean": 0.30230194330215454, "rewards/rollout_reward_func/std": 15.16736888885498, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.000328063964844, "sampling/sampling_logp_difference/mean": 0.28071892261505127, "step": 721, "step_time": 29.454528501002642 }, { "clip_ratio/high_max": 0.003873690206091851, "clip_ratio/high_mean": 0.003873690206091851, "clip_ratio/low_mean": 0.0049953726411331445, "clip_ratio/low_min": 0.0049953726411331445, "clip_ratio/region_mean": 0.008869062818121165, "completions/clipped_ratio": 0.0, "completions/max_length": 1992.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 1912.75, "completions/mean_terminated_length": 1912.75, "completions/min_length": 1841.0, "completions/min_terminated_length": 1841.0, "entropy": 0.03668797085992992, "epoch": 2.8880001155200045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035540773533284664, "kl": 0.01595085731241852, "learning_rate": 7.4629799974003075e-06, "loss": 0.0001, "num_tokens": 18185485.0, "reward": -0.965377688407898, "reward_std": 12.1666259765625, "rewards/rollout_reward_func/mean": -0.965377688407898, "rewards/rollout_reward_func/std": 12.1666259765625, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.8359375, "sampling/sampling_logp_difference/mean": 0.28383976221084595, "step": 722, "step_time": 28.490392917999998 }, { "clip_ratio/high_max": 0.004480124945985153, "clip_ratio/high_mean": 0.004480124945985153, "clip_ratio/low_mean": 0.004266985197318718, "clip_ratio/low_min": 0.004266985197318718, "clip_ratio/region_mean": 0.00874711008509621, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 1983.25, "completions/mean_terminated_length": 1983.25, "completions/min_length": 1813.0, "completions/min_terminated_length": 1813.0, "entropy": 0.03407833375968039, "epoch": 2.8920001156800046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004649181384593248, "kl": 0.014441720442846417, "learning_rate": 7.4629799973927214e-06, "loss": 0.0001, "num_tokens": 18230158.0, "reward": -3.7335362434387207, "reward_std": 9.185081481933594, "rewards/rollout_reward_func/mean": -3.7335362434387207, "rewards/rollout_reward_func/std": 9.185081481933594, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.0625, "sampling/sampling_logp_difference/mean": 0.269476056098938, "step": 723, "step_time": 29.204542406005203 }, { "clip_ratio/high_max": 0.0052465618355199695, "clip_ratio/high_mean": 0.0052465618355199695, "clip_ratio/low_mean": 0.003934489359380677, "clip_ratio/low_min": 0.003934489359380677, "clip_ratio/region_mean": 0.009181051165796816, "completions/clipped_ratio": 0.0, "completions/max_length": 1974.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1913.0, "completions/mean_terminated_length": 1913.0, "completions/min_length": 1848.0, "completions/min_terminated_length": 1848.0, "entropy": 0.03585662157274783, "epoch": 2.8960001158400047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004042237531393766, "kl": 0.012895889463834465, "learning_rate": 7.462979997385125e-06, "loss": 0.0001, "num_tokens": 18273662.0, "reward": -1.0387382507324219, "reward_std": 9.716029167175293, "rewards/rollout_reward_func/mean": -1.0387382507324219, "rewards/rollout_reward_func/std": 9.716029167175293, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.39794921875, "sampling/sampling_logp_difference/mean": 0.2875572443008423, "step": 724, "step_time": 28.63905061099649 }, { "clip_ratio/high_max": 0.004777330323122442, "clip_ratio/high_mean": 0.004777330323122442, "clip_ratio/low_mean": 0.0037926040240563452, "clip_ratio/low_min": 0.0037926040240563452, "clip_ratio/region_mean": 0.008569934230763465, "completions/clipped_ratio": 0.0, "completions/max_length": 2061.0, "completions/max_terminated_length": 2061.0, "completions/mean_length": 1966.9375, "completions/mean_terminated_length": 1966.9375, "completions/min_length": 1853.0, "completions/min_terminated_length": 1853.0, "entropy": 0.034054386895149946, "epoch": 2.9000001160000048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027729482389986515, "kl": 0.013202973175793886, "learning_rate": 7.462979997377518e-06, "loss": 0.0001, "num_tokens": 18318062.0, "reward": 0.4004595875740051, "reward_std": 14.23034381866455, "rewards/rollout_reward_func/mean": 0.4004595875740051, "rewards/rollout_reward_func/std": 14.23034381866455, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.37535095214844, "sampling/sampling_logp_difference/mean": 0.2723841965198517, "step": 725, "step_time": 29.40168883500155 }, { "clip_ratio/high_max": 0.004584362846799195, "clip_ratio/high_mean": 0.004584362846799195, "clip_ratio/low_mean": 0.004336315440014005, "clip_ratio/low_min": 0.004336315440014005, "clip_ratio/region_mean": 0.008920678228605539, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1935.25, "completions/mean_terminated_length": 1935.25, "completions/min_length": 1858.0, "completions/min_terminated_length": 1858.0, "entropy": 0.034821435110643506, "epoch": 2.9040001161600045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.018249932676553726, "kl": 0.015813059988431633, "learning_rate": 7.462979997369901e-06, "loss": 0.0001, "num_tokens": 18361940.0, "reward": 1.6882832050323486, "reward_std": 14.044145584106445, "rewards/rollout_reward_func/mean": 1.6882832050323486, "rewards/rollout_reward_func/std": 14.044145584106445, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.146484375, "sampling/sampling_logp_difference/mean": 0.27622559666633606, "step": 726, "step_time": 28.88260069300304 }, { "clip_ratio/high_max": 0.0037803653976880014, "clip_ratio/high_mean": 0.0037803653976880014, "clip_ratio/low_mean": 0.0041654159722384065, "clip_ratio/low_min": 0.0041654159722384065, "clip_ratio/region_mean": 0.007945781399030238, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1879.0, "completions/mean_terminated_length": 1879.0, "completions/min_length": 1778.0, "completions/min_terminated_length": 1778.0, "entropy": 0.036550283432006836, "epoch": 2.9080001163200046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00437904940918088, "kl": 0.014015332912094891, "learning_rate": 7.462979997362272e-06, "loss": 0.0001, "num_tokens": 18404879.0, "reward": 3.654906749725342, "reward_std": 10.768372535705566, "rewards/rollout_reward_func/mean": 3.654906749725342, "rewards/rollout_reward_func/std": 10.768373489379883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.306640625, "sampling/sampling_logp_difference/mean": 0.27816933393478394, "step": 727, "step_time": 28.441445800999645 }, { "clip_ratio/high_max": 0.0034191739396192133, "clip_ratio/high_mean": 0.0034191739396192133, "clip_ratio/low_mean": 0.004686129483161494, "clip_ratio/low_min": 0.004686129483161494, "clip_ratio/region_mean": 0.008105303219053894, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1997.0625, "completions/mean_terminated_length": 1997.0625, "completions/min_length": 1891.0, "completions/min_terminated_length": 1891.0, "entropy": 0.033648707903921604, "epoch": 2.9120001164800047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030074897222220898, "kl": 0.01275574485771358, "learning_rate": 7.462979997354632e-06, "loss": 0.0001, "num_tokens": 18449770.0, "reward": -2.621155261993408, "reward_std": 13.49407958984375, "rewards/rollout_reward_func/mean": -2.621155261993408, "rewards/rollout_reward_func/std": 13.494081497192383, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 61.2725830078125, "sampling/sampling_logp_difference/mean": 0.27457404136657715, "step": 728, "step_time": 29.40106292998098 }, { "clip_ratio/high_max": 0.004790445120306686, "clip_ratio/high_mean": 0.004790445120306686, "clip_ratio/low_mean": 0.0039646119403187186, "clip_ratio/low_min": 0.0039646119403187186, "clip_ratio/region_mean": 0.008755057118833065, "completions/clipped_ratio": 0.0, "completions/max_length": 2036.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1920.625, "completions/mean_terminated_length": 1920.625, "completions/min_length": 1792.0, "completions/min_terminated_length": 1792.0, "entropy": 0.035473385360091925, "epoch": 2.9160001166400047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009076962247490883, "kl": 0.014217275893315673, "learning_rate": 7.46297999734698e-06, "loss": 0.0001, "num_tokens": 18493391.0, "reward": -4.872392654418945, "reward_std": 11.653722763061523, "rewards/rollout_reward_func/mean": -4.872392654418945, "rewards/rollout_reward_func/std": 11.653722763061523, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.25, "sampling/sampling_logp_difference/mean": 0.28915926814079285, "step": 729, "step_time": 28.762672455995926 }, { "clip_ratio/high_max": 0.00441777307423763, "clip_ratio/high_mean": 0.00441777307423763, "clip_ratio/low_mean": 0.004274237056961283, "clip_ratio/low_min": 0.004274237056961283, "clip_ratio/region_mean": 0.008692010247614235, "completions/clipped_ratio": 0.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1882.875, "completions/mean_terminated_length": 1882.875, "completions/min_length": 1785.0, "completions/min_terminated_length": 1785.0, "entropy": 0.03653035406023264, "epoch": 2.9200001168000048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029935059137642384, "kl": 0.012841573101468384, "learning_rate": 7.4629799973393186e-06, "loss": 0.0001, "num_tokens": 18536393.0, "reward": 2.644758462905884, "reward_std": 13.194252967834473, "rewards/rollout_reward_func/mean": 2.644758462905884, "rewards/rollout_reward_func/std": 13.194252967834473, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.4375, "sampling/sampling_logp_difference/mean": 0.28799283504486084, "step": 730, "step_time": 28.788456882008177 }, { "clip_ratio/high_max": 0.004630840441677719, "clip_ratio/high_mean": 0.004630840441677719, "clip_ratio/low_mean": 0.003747819544514641, "clip_ratio/low_min": 0.003747819544514641, "clip_ratio/region_mean": 0.00837865995708853, "completions/clipped_ratio": 0.0, "completions/max_length": 2067.0, "completions/max_terminated_length": 2067.0, "completions/mean_length": 1915.25, "completions/mean_terminated_length": 1915.25, "completions/min_length": 1800.0, "completions/min_terminated_length": 1800.0, "entropy": 0.03577035665512085, "epoch": 2.9240001169600046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006531745195388794, "kl": 0.016976338694803417, "learning_rate": 7.4629799973316444e-06, "loss": 0.0002, "num_tokens": 18579933.0, "reward": -0.30140745639801025, "reward_std": 11.761985778808594, "rewards/rollout_reward_func/mean": -0.30140745639801025, "rewards/rollout_reward_func/std": 11.761985778808594, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.78125, "sampling/sampling_logp_difference/mean": 0.2876862585544586, "step": 731, "step_time": 29.40052677300264 }, { "clip_ratio/high_max": 0.0038658022531308234, "clip_ratio/high_mean": 0.0038658022531308234, "clip_ratio/low_mean": 0.004604321875376627, "clip_ratio/low_min": 0.004604321875376627, "clip_ratio/region_mean": 0.00847012409940362, "completions/clipped_ratio": 0.0, "completions/max_length": 2032.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1956.9375, "completions/mean_terminated_length": 1956.9375, "completions/min_length": 1855.0, "completions/min_terminated_length": 1855.0, "entropy": 0.034282560693100095, "epoch": 2.9280001171200046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036963566672056913, "kl": 0.012378850195091218, "learning_rate": 7.462979997323961e-06, "loss": 0.0001, "num_tokens": 18624162.0, "reward": -3.4124882221221924, "reward_std": 12.42409610748291, "rewards/rollout_reward_func/mean": -3.4124882221221924, "rewards/rollout_reward_func/std": 12.424097061157227, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.10938262939453, "sampling/sampling_logp_difference/mean": 0.27286210656166077, "step": 732, "step_time": 28.922902902988426 }, { "clip_ratio/high_max": 0.004617665457772091, "clip_ratio/high_mean": 0.004617665457772091, "clip_ratio/low_mean": 0.003944091469747946, "clip_ratio/low_min": 0.003944091469747946, "clip_ratio/region_mean": 0.008561756985727698, "completions/clipped_ratio": 0.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1957.1875, "completions/mean_terminated_length": 1957.1875, "completions/min_length": 1831.0, "completions/min_terminated_length": 1831.0, "entropy": 0.03334492351859808, "epoch": 2.9320001172800047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0020142088178545237, "kl": 0.011004143743775785, "learning_rate": 7.4629799973162666e-06, "loss": 0.0001, "num_tokens": 18668385.0, "reward": -3.112412929534912, "reward_std": 12.661928176879883, "rewards/rollout_reward_func/mean": -3.112412929534912, "rewards/rollout_reward_func/std": 12.661928176879883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.125, "sampling/sampling_logp_difference/mean": 0.2813401222229004, "step": 733, "step_time": 29.09005595400231 }, { "clip_ratio/high_max": 0.0038291745877359062, "clip_ratio/high_mean": 0.0038291745877359062, "clip_ratio/low_mean": 0.005315172980772331, "clip_ratio/low_min": 0.005315172980772331, "clip_ratio/region_mean": 0.009144347510300577, "completions/clipped_ratio": 0.0, "completions/max_length": 1981.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 1897.9375, "completions/mean_terminated_length": 1897.9375, "completions/min_length": 1810.0, "completions/min_terminated_length": 1810.0, "entropy": 0.038406758569180965, "epoch": 2.9360001174400048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.023678304627537727, "kl": 0.01713961863424629, "learning_rate": 7.462979997308559e-06, "loss": 0.0002, "num_tokens": 18711635.0, "reward": -2.0771608352661133, "reward_std": 8.26648235321045, "rewards/rollout_reward_func/mean": -2.0771608352661133, "rewards/rollout_reward_func/std": 8.266483306884766, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.3127326965332, "sampling/sampling_logp_difference/mean": 0.2849014699459076, "step": 734, "step_time": 28.793376108013035 }, { "clip_ratio/high_max": 0.0036403685080586, "clip_ratio/high_mean": 0.0036403685080586, "clip_ratio/low_mean": 0.005389606289099902, "clip_ratio/low_min": 0.005389606289099902, "clip_ratio/region_mean": 0.009029974753502756, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1748.5, "completions/mean_terminated_length": 1748.5, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.037139042280614376, "epoch": 2.9400001176000045e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002622690051794052, "kl": 0.010516594222281128, "learning_rate": 7.462979997300842e-06, "loss": 0.0001, "num_tokens": 18752540.0, "reward": 7.108323574066162, "reward_std": 31.393415451049805, "rewards/rollout_reward_func/mean": 7.108323574066162, "rewards/rollout_reward_func/std": 31.393417358398438, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.843788146972656, "sampling/sampling_logp_difference/mean": 0.2741728723049164, "step": 735, "step_time": 27.968796791996283 }, { "clip_ratio/high_max": 0.0038121721881907433, "clip_ratio/high_mean": 0.0038121721881907433, "clip_ratio/low_mean": 0.004523910844000056, "clip_ratio/low_min": 0.004523910844000056, "clip_ratio/region_mean": 0.008336082944879308, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1922.25, "completions/mean_terminated_length": 1922.25, "completions/min_length": 1817.0, "completions/min_terminated_length": 1817.0, "entropy": 0.034930315567180514, "epoch": 2.9440001177600046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01992529258131981, "kl": 0.01743913785321638, "learning_rate": 7.462979997293113e-06, "loss": 0.0002, "num_tokens": 18796205.0, "reward": -0.05253791809082031, "reward_std": 10.578097343444824, "rewards/rollout_reward_func/mean": -0.05253791809082031, "rewards/rollout_reward_func/std": 10.578097343444824, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.484378814697266, "sampling/sampling_logp_difference/mean": 0.2802004814147949, "step": 736, "step_time": 29.173778014017444 }, { "clip_ratio/high_max": 0.005658719863276929, "clip_ratio/high_mean": 0.005658719863276929, "clip_ratio/low_mean": 0.003352265397552401, "clip_ratio/low_min": 0.003352265397552401, "clip_ratio/region_mean": 0.00901098531903699, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1888.875, "completions/mean_terminated_length": 1888.875, "completions/min_length": 1806.0, "completions/min_terminated_length": 1806.0, "entropy": 0.035430626943707466, "epoch": 2.9480001179200047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004433515947312117, "kl": 0.015257337945513427, "learning_rate": 7.462979997285374e-06, "loss": 0.0001, "num_tokens": 18839293.0, "reward": -5.629010200500488, "reward_std": 7.92061710357666, "rewards/rollout_reward_func/mean": -5.629010200500488, "rewards/rollout_reward_func/std": 7.920617580413818, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.125, "sampling/sampling_logp_difference/mean": 0.28066369891166687, "step": 737, "step_time": 28.86735647000023 }, { "clip_ratio/high_max": 0.0032528511655982584, "clip_ratio/high_mean": 0.0032528511655982584, "clip_ratio/low_mean": 0.005583025660598651, "clip_ratio/low_min": 0.005583025660598651, "clip_ratio/region_mean": 0.008835876826196909, "completions/clipped_ratio": 0.0, "completions/max_length": 2062.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 1973.5625, "completions/mean_terminated_length": 1973.5625, "completions/min_length": 1801.0, "completions/min_terminated_length": 1801.0, "entropy": 0.032942735590040684, "epoch": 2.9520001180800048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0023366124369204044, "kl": 0.012213818961754441, "learning_rate": 7.462979997277624e-06, "loss": 0.0001, "num_tokens": 18883799.0, "reward": 2.2272515296936035, "reward_std": 13.204140663146973, "rewards/rollout_reward_func/mean": 2.2272515296936035, "rewards/rollout_reward_func/std": 13.204141616821289, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.859375, "sampling/sampling_logp_difference/mean": 0.2828938066959381, "step": 738, "step_time": 29.136995630004094 }, { "clip_ratio/high_max": 0.0039250829431694, "clip_ratio/high_mean": 0.0039250829431694, "clip_ratio/low_mean": 0.0041232141957152635, "clip_ratio/low_min": 0.0041232141957152635, "clip_ratio/region_mean": 0.008048297197092324, "completions/clipped_ratio": 0.0, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 1948.9375, "completions/mean_terminated_length": 1948.9375, "completions/min_length": 1804.0, "completions/min_terminated_length": 1804.0, "entropy": 0.03663674439303577, "epoch": 2.956000118240005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002422440331429243, "kl": 0.0134518007398583, "learning_rate": 7.462979997269863e-06, "loss": 0.0001, "num_tokens": 18927883.0, "reward": 0.08702588081359863, "reward_std": 13.095012664794922, "rewards/rollout_reward_func/mean": 0.08702588081359863, "rewards/rollout_reward_func/std": 13.095013618469238, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.15625, "sampling/sampling_logp_difference/mean": 0.2825336456298828, "step": 739, "step_time": 29.262015562009765 }, { "clip_ratio/high_max": 0.0038201690185815096, "clip_ratio/high_mean": 0.0038201690185815096, "clip_ratio/low_mean": 0.005045428843004629, "clip_ratio/low_min": 0.005045428843004629, "clip_ratio/region_mean": 0.008865598007105291, "completions/clipped_ratio": 0.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1930.3125, "completions/mean_terminated_length": 1930.3125, "completions/min_length": 1823.0, "completions/min_terminated_length": 1823.0, "entropy": 0.0344460210762918, "epoch": 2.9600001184000046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028849435038864613, "kl": 0.013372539659030735, "learning_rate": 7.4629799972620895e-06, "loss": 0.0001, "num_tokens": 18971654.0, "reward": 1.558312177658081, "reward_std": 10.296650886535645, "rewards/rollout_reward_func/mean": 1.558312177658081, "rewards/rollout_reward_func/std": 10.296651840209961, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.390625, "sampling/sampling_logp_difference/mean": 0.28305527567863464, "step": 740, "step_time": 28.848438866989454 }, { "clip_ratio/high_max": 0.003373025276232511, "clip_ratio/high_mean": 0.003373025276232511, "clip_ratio/low_mean": 0.005989031575154513, "clip_ratio/low_min": 0.005989031575154513, "clip_ratio/region_mean": 0.009362056793179363, "completions/clipped_ratio": 0.0, "completions/max_length": 2068.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 1835.875, "completions/mean_terminated_length": 1835.875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.035959812346845865, "epoch": 2.9640001185600047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007716757711023092, "kl": 0.015139898518100381, "learning_rate": 7.462979997254305e-06, "loss": 0.0001, "num_tokens": 19013951.0, "reward": 7.06867790222168, "reward_std": 26.611793518066406, "rewards/rollout_reward_func/mean": 7.06867790222168, "rewards/rollout_reward_func/std": 26.611793518066406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.40625, "sampling/sampling_logp_difference/mean": 0.28675469756126404, "step": 741, "step_time": 28.824203038981068 }, { "clip_ratio/high_max": 0.003925288037862629, "clip_ratio/high_mean": 0.003925288037862629, "clip_ratio/low_mean": 0.004855352599406615, "clip_ratio/low_min": 0.004855352599406615, "clip_ratio/region_mean": 0.008780640608165413, "completions/clipped_ratio": 0.0, "completions/max_length": 2058.0, "completions/max_terminated_length": 2058.0, "completions/mean_length": 1960.1875, "completions/mean_terminated_length": 1960.1875, "completions/min_length": 1813.0, "completions/min_terminated_length": 1813.0, "entropy": 0.03391368337906897, "epoch": 2.9680001187200048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004062481224536896, "kl": 0.013473406317643821, "learning_rate": 7.46297999724651e-06, "loss": 0.0001, "num_tokens": 19058211.0, "reward": -0.2691267728805542, "reward_std": 10.410741806030273, "rewards/rollout_reward_func/mean": -0.2691267728805542, "rewards/rollout_reward_func/std": 10.410741806030273, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 38.562503814697266, "sampling/sampling_logp_difference/mean": 0.2749696671962738, "step": 742, "step_time": 29.318443356009084 }, { "clip_ratio/high_max": 0.004779450304340571, "clip_ratio/high_mean": 0.004779450304340571, "clip_ratio/low_mean": 0.003916976653272286, "clip_ratio/low_min": 0.003916976653272286, "clip_ratio/region_mean": 0.008696426928509027, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 1978.4375, "completions/mean_terminated_length": 1978.4375, "completions/min_length": 1702.0, "completions/min_terminated_length": 1702.0, "entropy": 0.03459855867549777, "epoch": 2.972000118880005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00337544665671885, "kl": 0.013167561439331621, "learning_rate": 7.4629799972387054e-06, "loss": 0.0001, "num_tokens": 19102791.0, "reward": -0.0020818710327148438, "reward_std": 16.220748901367188, "rewards/rollout_reward_func/mean": -0.0020818710327148438, "rewards/rollout_reward_func/std": 16.220748901367188, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.71875, "sampling/sampling_logp_difference/mean": 0.2865678369998932, "step": 743, "step_time": 29.226720280988957 }, { "clip_ratio/high_max": 0.005449011223390698, "clip_ratio/high_mean": 0.005449011223390698, "clip_ratio/low_mean": 0.004047343565616757, "clip_ratio/low_min": 0.004047343565616757, "clip_ratio/region_mean": 0.009496354730799794, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1945.5, "completions/mean_terminated_length": 1945.5, "completions/min_length": 1844.0, "completions/min_terminated_length": 1844.0, "entropy": 0.03485848242416978, "epoch": 2.9760001190400046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006171096581965685, "kl": 0.01526868634391576, "learning_rate": 7.462979997230889e-06, "loss": 0.0001, "num_tokens": 19146837.0, "reward": -4.426651954650879, "reward_std": 11.428723335266113, "rewards/rollout_reward_func/mean": -4.426651954650879, "rewards/rollout_reward_func/std": 11.428723335266113, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.257904052734375, "sampling/sampling_logp_difference/mean": 0.2757047414779663, "step": 744, "step_time": 29.1006623590074 }, { "clip_ratio/high_max": 0.00454437910229899, "clip_ratio/high_mean": 0.00454437910229899, "clip_ratio/low_mean": 0.004544534836895764, "clip_ratio/low_min": 0.004544534836895764, "clip_ratio/region_mean": 0.009088913910090923, "completions/clipped_ratio": 0.0, "completions/max_length": 1982.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1892.9375, "completions/mean_terminated_length": 1892.9375, "completions/min_length": 1680.0, "completions/min_terminated_length": 1680.0, "entropy": 0.037031309213489294, "epoch": 2.9800001192000047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025998828932642937, "kl": 0.011539788800291717, "learning_rate": 7.462979997223061e-06, "loss": 0.0001, "num_tokens": 19190017.0, "reward": 4.78128719329834, "reward_std": 15.098912239074707, "rewards/rollout_reward_func/mean": 4.78128719329834, "rewards/rollout_reward_func/std": 15.098912239074707, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.5625, "sampling/sampling_logp_difference/mean": 0.29724225401878357, "step": 745, "step_time": 28.701621721003903 }, { "clip_ratio/high_max": 0.006143902603071183, "clip_ratio/high_mean": 0.006143902603071183, "clip_ratio/low_mean": 0.0031541835924144834, "clip_ratio/low_min": 0.0031541835924144834, "clip_ratio/region_mean": 0.009298086049966514, "completions/clipped_ratio": 0.0, "completions/max_length": 2036.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1896.3125, "completions/mean_terminated_length": 1896.3125, "completions/min_length": 1790.0, "completions/min_terminated_length": 1790.0, "entropy": 0.0360553958453238, "epoch": 2.9840001193600048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.033102698624134064, "kl": 0.015891216229647398, "learning_rate": 7.462979997215221e-06, "loss": 0.0001, "num_tokens": 19233251.0, "reward": -0.7823976278305054, "reward_std": 11.012094497680664, "rewards/rollout_reward_func/mean": -0.7823976278305054, "rewards/rollout_reward_func/std": 11.012094497680664, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 62.44140625, "sampling/sampling_logp_difference/mean": 0.29300665855407715, "step": 746, "step_time": 28.451076176017523 }, { "clip_ratio/high_max": 0.003924142380128615, "clip_ratio/high_mean": 0.003924142380128615, "clip_ratio/low_mean": 0.005187802686123177, "clip_ratio/low_min": 0.005187802686123177, "clip_ratio/region_mean": 0.009111945109907538, "completions/clipped_ratio": 0.0, "completions/max_length": 1974.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1795.75, "completions/mean_terminated_length": 1795.75, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.037903085350990295, "epoch": 2.988000119520005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00517709506675601, "kl": 0.013664446421898901, "learning_rate": 7.462979997207371e-06, "loss": 0.0001, "num_tokens": 19274867.0, "reward": 5.357452392578125, "reward_std": 25.920333862304688, "rewards/rollout_reward_func/mean": 5.357452392578125, "rewards/rollout_reward_func/std": 25.92033576965332, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.84375, "sampling/sampling_logp_difference/mean": 0.2847157418727875, "step": 747, "step_time": 27.857738247999805 }, { "clip_ratio/high_max": 0.004884459456661716, "clip_ratio/high_mean": 0.004884459456661716, "clip_ratio/low_mean": 0.004165064194239676, "clip_ratio/low_min": 0.004165064194239676, "clip_ratio/region_mean": 0.009049523680005223, "completions/clipped_ratio": 0.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 1886.4375, "completions/mean_terminated_length": 1886.4375, "completions/min_length": 1764.0, "completions/min_terminated_length": 1764.0, "entropy": 0.035837415140122175, "epoch": 2.992000119680005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010587414726614952, "kl": 0.014903741306625307, "learning_rate": 7.462979997199511e-06, "loss": 0.0001, "num_tokens": 19317931.0, "reward": 4.257648468017578, "reward_std": 15.738558769226074, "rewards/rollout_reward_func/mean": 4.257648468017578, "rewards/rollout_reward_func/std": 15.738558769226074, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.1015625, "sampling/sampling_logp_difference/mean": 0.2941291332244873, "step": 748, "step_time": 28.547312630005763 }, { "clip_ratio/high_max": 0.0035526913416106254, "clip_ratio/high_mean": 0.0035526913416106254, "clip_ratio/low_mean": 0.005863008962478489, "clip_ratio/low_min": 0.005863008962478489, "clip_ratio/region_mean": 0.009415700391400605, "completions/clipped_ratio": 0.0, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 1931.1875, "completions/mean_terminated_length": 1931.1875, "completions/min_length": 1608.0, "completions/min_terminated_length": 1608.0, "entropy": 0.03743473789654672, "epoch": 2.9960001198400047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00887687224894762, "kl": 0.01588004140648991, "learning_rate": 7.462979997191638e-06, "loss": 0.0001, "num_tokens": 19361746.0, "reward": 4.028827667236328, "reward_std": 35.41598892211914, "rewards/rollout_reward_func/mean": 4.028827667236328, "rewards/rollout_reward_func/std": 35.415992736816406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 61.51708984375, "sampling/sampling_logp_difference/mean": 0.29545778036117554, "step": 749, "step_time": 29.385847164987354 }, { "clip_ratio/high_max": 0.004409945162478834, "clip_ratio/high_mean": 0.004409945162478834, "clip_ratio/low_mean": 0.004491951607633382, "clip_ratio/low_min": 0.004491951607633382, "clip_ratio/region_mean": 0.008901896770112216, "completions/clipped_ratio": 0.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1865.1875, "completions/mean_terminated_length": 1865.1875, "completions/min_length": 1337.0, "completions/min_terminated_length": 1337.0, "entropy": 0.03732103342190385, "epoch": 3.0000001200000047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004834816325455904, "kl": 0.014156990684568882, "learning_rate": 7.462979997183755e-06, "loss": 0.0001, "num_tokens": 19404448.0, "reward": 6.8439249992370605, "reward_std": 31.148740768432617, "rewards/rollout_reward_func/mean": 6.8439249992370605, "rewards/rollout_reward_func/std": 31.14874267578125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.25, "sampling/sampling_logp_difference/mean": 0.3058558404445648, "step": 750, "step_time": 28.192822176017216 }, { "clip_ratio/high_max": 0.0020973049686290324, "clip_ratio/high_mean": 0.0020973049686290324, "clip_ratio/low_mean": 0.005672125611454248, "clip_ratio/low_min": 0.005672125611454248, "clip_ratio/region_mean": 0.007769430638290942, "completions/clipped_ratio": 0.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 1661.375, "completions/mean_terminated_length": 1661.375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.04103564517572522, "epoch": 3.0040001201600048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010070884600281715, "kl": 0.013332935224752873, "learning_rate": 7.462979997175861e-06, "loss": 0.0001, "num_tokens": 19443885.0, "reward": 11.193130493164062, "reward_std": 34.255950927734375, "rewards/rollout_reward_func/mean": 11.193130493164062, "rewards/rollout_reward_func/std": 34.25594711303711, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.453125, "sampling/sampling_logp_difference/mean": 0.29688021540641785, "step": 751, "step_time": 26.163333541015163 }, { "clip_ratio/high_max": 0.005031389009673148, "clip_ratio/high_mean": 0.005031389009673148, "clip_ratio/low_mean": 0.003982215013820678, "clip_ratio/low_min": 0.003982215013820678, "clip_ratio/region_mean": 0.009013604139909148, "completions/clipped_ratio": 0.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1895.625, "completions/mean_terminated_length": 1895.625, "completions/min_length": 1826.0, "completions/min_terminated_length": 1826.0, "entropy": 0.03633048804476857, "epoch": 3.008000120320005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007163743022829294, "kl": 0.01685800717677921, "learning_rate": 7.462979997167956e-06, "loss": 0.0002, "num_tokens": 19487105.0, "reward": -1.1936428546905518, "reward_std": 10.96961498260498, "rewards/rollout_reward_func/mean": -1.1936428546905518, "rewards/rollout_reward_func/std": 10.96961498260498, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 59.6669921875, "sampling/sampling_logp_difference/mean": 0.29110100865364075, "step": 752, "step_time": 28.534852007986046 }, { "clip_ratio/high_max": 0.0043768965697381645, "clip_ratio/high_mean": 0.0043768965697381645, "clip_ratio/low_mean": 0.004567905038129538, "clip_ratio/low_min": 0.004567905038129538, "clip_ratio/region_mean": 0.008944801695179194, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1935.25, "completions/mean_terminated_length": 1935.25, "completions/min_length": 1818.0, "completions/min_terminated_length": 1818.0, "entropy": 0.03622446162626147, "epoch": 3.0120001204800047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0019954696763306856, "kl": 0.011604164959862828, "learning_rate": 7.46297999716004e-06, "loss": 0.0001, "num_tokens": 19530963.0, "reward": -6.122139930725098, "reward_std": 10.201497077941895, "rewards/rollout_reward_func/mean": -6.122139930725098, "rewards/rollout_reward_func/std": 10.201497077941895, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.625, "sampling/sampling_logp_difference/mean": 0.28053027391433716, "step": 753, "step_time": 29.194943660957506 }, { "clip_ratio/high_max": 0.004998085321858525, "clip_ratio/high_mean": 0.004998085321858525, "clip_ratio/low_mean": 0.003553898073732853, "clip_ratio/low_min": 0.003553898073732853, "clip_ratio/region_mean": 0.008551983337383717, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1833.0, "completions/mean_terminated_length": 1833.0, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.0347238564863801, "epoch": 3.0160001206400047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0065495590679347515, "kl": 0.012674775265622884, "learning_rate": 7.4629799971521124e-06, "loss": 0.0001, "num_tokens": 19573193.0, "reward": 13.96647834777832, "reward_std": 25.430025100708008, "rewards/rollout_reward_func/mean": 13.96647834777832, "rewards/rollout_reward_func/std": 25.430025100708008, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.656253814697266, "sampling/sampling_logp_difference/mean": 0.2862381339073181, "step": 754, "step_time": 28.304294039029628 }, { "clip_ratio/high_max": 0.005564942286582664, "clip_ratio/high_mean": 0.005564942286582664, "clip_ratio/low_mean": 0.003996271669166163, "clip_ratio/low_min": 0.003996271669166163, "clip_ratio/region_mean": 0.009561213955748826, "completions/clipped_ratio": 0.0, "completions/max_length": 1990.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1890.0625, "completions/mean_terminated_length": 1890.0625, "completions/min_length": 1600.0, "completions/min_terminated_length": 1600.0, "entropy": 0.037172339390963316, "epoch": 3.0200001208000048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 11.524944305419922, "kl": 2.1360009519848973, "learning_rate": 7.462979997144174e-06, "loss": 0.018, "num_tokens": 19616325.0, "reward": 5.77150821685791, "reward_std": 36.61399459838867, "rewards/rollout_reward_func/mean": 5.77150821685791, "rewards/rollout_reward_func/std": 36.61399459838867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.09375, "sampling/sampling_logp_difference/mean": 0.2968870997428894, "step": 755, "step_time": 28.197812115962734 }, { "clip_ratio/high_max": 0.004465439269552007, "clip_ratio/high_mean": 0.004465439269552007, "clip_ratio/low_mean": 0.003121041358099319, "clip_ratio/low_min": 0.003121041358099319, "clip_ratio/region_mean": 0.007586480583995581, "completions/clipped_ratio": 0.0, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 1900.875, "completions/mean_terminated_length": 1900.875, "completions/min_length": 1821.0, "completions/min_terminated_length": 1821.0, "entropy": 0.036831192672252655, "epoch": 3.024000120960005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038826772943139076, "kl": 0.011483409674838185, "learning_rate": 7.462979997136225e-06, "loss": 0.0001, "num_tokens": 19659629.0, "reward": 2.807568311691284, "reward_std": 10.7330904006958, "rewards/rollout_reward_func/mean": 2.807568311691284, "rewards/rollout_reward_func/std": 10.733091354370117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.1875, "sampling/sampling_logp_difference/mean": 0.27657416462898254, "step": 756, "step_time": 29.023028301016893 }, { "clip_ratio/high_max": 0.004965514992363751, "clip_ratio/high_mean": 0.004965514992363751, "clip_ratio/low_mean": 0.005441643646918237, "clip_ratio/low_min": 0.005441643646918237, "clip_ratio/region_mean": 0.010407158581074327, "completions/clipped_ratio": 0.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1866.0625, "completions/mean_terminated_length": 1866.0625, "completions/min_length": 1564.0, "completions/min_terminated_length": 1564.0, "entropy": 0.037163855973631144, "epoch": 3.028000121120005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02062235213816166, "kl": 0.019511260092258453, "learning_rate": 7.462979997128264e-06, "loss": 0.0002, "num_tokens": 19702364.0, "reward": 7.169250011444092, "reward_std": 30.473342895507812, "rewards/rollout_reward_func/mean": 7.169250011444092, "rewards/rollout_reward_func/std": 30.473342895507812, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.28125, "sampling/sampling_logp_difference/mean": 0.2953627109527588, "step": 757, "step_time": 28.453093533986248 }, { "clip_ratio/high_max": 0.003862099605612457, "clip_ratio/high_mean": 0.003862099605612457, "clip_ratio/low_mean": 0.004470674466574565, "clip_ratio/low_min": 0.004470674466574565, "clip_ratio/region_mean": 0.008332774101290852, "completions/clipped_ratio": 0.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1973.125, "completions/mean_terminated_length": 1973.125, "completions/min_length": 1887.0, "completions/min_terminated_length": 1887.0, "entropy": 0.037625818978995085, "epoch": 3.0320001212800047e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005314708221703768, "kl": 0.015734399436041713, "learning_rate": 7.462979997120293e-06, "loss": 0.0002, "num_tokens": 19746861.0, "reward": -5.039022922515869, "reward_std": 10.760473251342773, "rewards/rollout_reward_func/mean": -5.039022922515869, "rewards/rollout_reward_func/std": 10.76047420501709, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.046875, "sampling/sampling_logp_difference/mean": 0.27095097303390503, "step": 758, "step_time": 28.70188202302961 }, { "clip_ratio/high_max": 0.004433649213751778, "clip_ratio/high_mean": 0.004433649213751778, "clip_ratio/low_mean": 0.00445594236953184, "clip_ratio/low_min": 0.00445594236953184, "clip_ratio/region_mean": 0.008889591554179788, "completions/clipped_ratio": 0.0, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 1977.6875, "completions/mean_terminated_length": 1977.6875, "completions/min_length": 1871.0, "completions/min_terminated_length": 1871.0, "entropy": 0.037705764174461365, "epoch": 3.0360001214400048e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006900593638420105, "kl": 0.019455632660537958, "learning_rate": 7.46297999711231e-06, "loss": 0.0002, "num_tokens": 19791449.0, "reward": 0.47892940044403076, "reward_std": 7.159378528594971, "rewards/rollout_reward_func/mean": 0.47892940044403076, "rewards/rollout_reward_func/std": 7.159379005432129, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.390625, "sampling/sampling_logp_difference/mean": 0.2693158984184265, "step": 759, "step_time": 28.99387705200934 }, { "clip_ratio/high_max": 0.004570173739921302, "clip_ratio/high_mean": 0.004570173739921302, "clip_ratio/low_mean": 0.002768499922240153, "clip_ratio/low_min": 0.002768499922240153, "clip_ratio/region_mean": 0.007338673691265285, "completions/clipped_ratio": 0.0, "completions/max_length": 2055.0, "completions/max_terminated_length": 2055.0, "completions/mean_length": 1967.5, "completions/mean_terminated_length": 1967.5, "completions/min_length": 1803.0, "completions/min_terminated_length": 1803.0, "entropy": 0.0362169467844069, "epoch": 3.040000121600005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00543265650048852, "kl": 0.01833712332881987, "learning_rate": 7.462979997104317e-06, "loss": 0.0002, "num_tokens": 19835838.0, "reward": -3.486788272857666, "reward_std": 12.348264694213867, "rewards/rollout_reward_func/mean": -3.486788272857666, "rewards/rollout_reward_func/std": 12.348264694213867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.062503814697266, "sampling/sampling_logp_difference/mean": 0.26122626662254333, "step": 760, "step_time": 29.13604462899093 }, { "clip_ratio/high_max": 0.004516046115895733, "clip_ratio/high_mean": 0.004516046115895733, "clip_ratio/low_mean": 0.003955612613935955, "clip_ratio/low_min": 0.003955612613935955, "clip_ratio/region_mean": 0.008471658627968282, "completions/clipped_ratio": 0.0, "completions/max_length": 2066.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 1911.5, "completions/mean_terminated_length": 1911.5, "completions/min_length": 1812.0, "completions/min_terminated_length": 1812.0, "entropy": 0.03786758007481694, "epoch": 3.044000121760005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02269195206463337, "kl": 0.025667303358204663, "learning_rate": 7.462979997096311e-06, "loss": 0.0002, "num_tokens": 19879325.0, "reward": -0.5908738374710083, "reward_std": 16.75385856628418, "rewards/rollout_reward_func/mean": -0.5908738374710083, "rewards/rollout_reward_func/std": 16.753860473632812, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.05484390258789, "sampling/sampling_logp_difference/mean": 0.2755439281463623, "step": 761, "step_time": 29.053918464953313 }, { "clip_ratio/high_max": 0.004235857981257141, "clip_ratio/high_mean": 0.004235857981257141, "clip_ratio/low_mean": 0.003764793509617448, "clip_ratio/low_min": 0.003764793509617448, "clip_ratio/region_mean": 0.008000651432666928, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1809.0, "completions/mean_terminated_length": 1809.0, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.03794890455901623, "epoch": 3.048000121920005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.11843065172433853, "kl": 0.1025691544637084, "learning_rate": 7.462979997088296e-06, "loss": 0.001, "num_tokens": 19921161.0, "reward": 10.233598709106445, "reward_std": 27.192598342895508, "rewards/rollout_reward_func/mean": 10.233598709106445, "rewards/rollout_reward_func/std": 27.192602157592773, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.359378814697266, "sampling/sampling_logp_difference/mean": 0.27668678760528564, "step": 762, "step_time": 28.127315170990187 }, { "clip_ratio/high_max": 0.0026997801032848656, "clip_ratio/high_mean": 0.0026997801032848656, "clip_ratio/low_mean": 0.004937843681545928, "clip_ratio/low_min": 0.004937843681545928, "clip_ratio/region_mean": 0.007637623755726963, "completions/clipped_ratio": 0.0, "completions/max_length": 2069.0, "completions/max_terminated_length": 2069.0, "completions/mean_length": 1907.375, "completions/mean_terminated_length": 1907.375, "completions/min_length": 1333.0, "completions/min_terminated_length": 1333.0, "entropy": 0.04040715843439102, "epoch": 3.052000122080005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.06395705044269562, "kl": 0.04256788152270019, "learning_rate": 7.462979997080269e-06, "loss": 0.0004, "num_tokens": 19964596.0, "reward": 7.0285234451293945, "reward_std": 31.63996124267578, "rewards/rollout_reward_func/mean": 7.0285234451293945, "rewards/rollout_reward_func/std": 31.63996124267578, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.62110137939453, "sampling/sampling_logp_difference/mean": 0.27949708700180054, "step": 763, "step_time": 28.748426619000384 }, { "clip_ratio/high_max": 0.0036272835568524897, "clip_ratio/high_mean": 0.0036272835568524897, "clip_ratio/low_mean": 0.0040651959425304085, "clip_ratio/low_min": 0.0040651959425304085, "clip_ratio/region_mean": 0.007692479528486729, "completions/clipped_ratio": 0.0, "completions/max_length": 2031.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1957.8125, "completions/mean_terminated_length": 1957.8125, "completions/min_length": 1908.0, "completions/min_terminated_length": 1908.0, "entropy": 0.039766674395650625, "epoch": 3.056000122240005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.08193741738796234, "kl": 0.04074974195100367, "learning_rate": 7.462979997072231e-06, "loss": 0.0004, "num_tokens": 20008840.0, "reward": -2.0117383003234863, "reward_std": 10.460890769958496, "rewards/rollout_reward_func/mean": -2.0117383003234863, "rewards/rollout_reward_func/std": 10.460890769958496, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.0, "sampling/sampling_logp_difference/mean": 0.26247063279151917, "step": 764, "step_time": 28.38802648599085 }, { "clip_ratio/high_max": 0.0032285161141771823, "clip_ratio/high_mean": 0.0032285161141771823, "clip_ratio/low_mean": 0.00435502637992613, "clip_ratio/low_min": 0.00435502637992613, "clip_ratio/region_mean": 0.007583542610518634, "completions/clipped_ratio": 0.0, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1792.375, "completions/mean_terminated_length": 1792.375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.040998047683387995, "epoch": 3.0600001224000046e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.034191351383924484, "kl": 0.03984884335659444, "learning_rate": 7.462979997064183e-06, "loss": 0.0004, "num_tokens": 20050398.0, "reward": 5.922994613647461, "reward_std": 25.207094192504883, "rewards/rollout_reward_func/mean": 5.922994613647461, "rewards/rollout_reward_func/std": 25.207094192504883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.531341552734375, "sampling/sampling_logp_difference/mean": 0.27061620354652405, "step": 765, "step_time": 27.72865526903479 }, { "clip_ratio/high_max": 0.002762884832918644, "clip_ratio/high_mean": 0.002762884832918644, "clip_ratio/low_mean": 0.004781236813869327, "clip_ratio/low_min": 0.004781236813869327, "clip_ratio/region_mean": 0.007544121704995632, "completions/clipped_ratio": 0.0, "completions/max_length": 2034.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1933.4375, "completions/mean_terminated_length": 1933.4375, "completions/min_length": 1774.0, "completions/min_terminated_length": 1774.0, "entropy": 0.03816865710541606, "epoch": 3.064000122560005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.014105401001870632, "kl": 0.030387570150196552, "learning_rate": 7.462979997056122e-06, "loss": 0.0003, "num_tokens": 20094240.0, "reward": 0.027648568153381348, "reward_std": 11.771894454956055, "rewards/rollout_reward_func/mean": 0.027648568153381348, "rewards/rollout_reward_func/std": 11.771895408630371, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.968753814697266, "sampling/sampling_logp_difference/mean": 0.2597244679927826, "step": 766, "step_time": 28.22092310701555 }, { "clip_ratio/high_max": 0.0032084508275147527, "clip_ratio/high_mean": 0.0032084508275147527, "clip_ratio/low_mean": 0.00436853984137997, "clip_ratio/low_min": 0.00436853984137997, "clip_ratio/region_mean": 0.007576990639790893, "completions/clipped_ratio": 0.0, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 1880.8125, "completions/mean_terminated_length": 1880.8125, "completions/min_length": 1815.0, "completions/min_terminated_length": 1815.0, "entropy": 0.04066665377467871, "epoch": 3.068000122720005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.027666326612234116, "kl": 0.04137184610590339, "learning_rate": 7.462979997048053e-06, "loss": 0.0004, "num_tokens": 20137208.0, "reward": -0.5617131590843201, "reward_std": 16.191173553466797, "rewards/rollout_reward_func/mean": -0.5617131590843201, "rewards/rollout_reward_func/std": 16.191173553466797, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.96875, "sampling/sampling_logp_difference/mean": 0.26596391201019287, "step": 767, "step_time": 27.728443698011688 }, { "clip_ratio/high_max": 0.004767464939504862, "clip_ratio/high_mean": 0.004767464939504862, "clip_ratio/low_mean": 0.002623396983835846, "clip_ratio/low_min": 0.002623396983835846, "clip_ratio/region_mean": 0.007390862097963691, "completions/clipped_ratio": 0.0, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 1961.5625, "completions/mean_terminated_length": 1961.5625, "completions/min_length": 1835.0, "completions/min_terminated_length": 1835.0, "entropy": 0.04084271378815174, "epoch": 3.072000122880005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.41236716508865356, "kl": 0.028664823854342103, "learning_rate": 7.46297999703997e-06, "loss": 0.0003, "num_tokens": 20181511.0, "reward": 1.9612544775009155, "reward_std": 13.934028625488281, "rewards/rollout_reward_func/mean": 1.9612544775009155, "rewards/rollout_reward_func/std": 13.934030532836914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.53125, "sampling/sampling_logp_difference/mean": 0.2553606331348419, "step": 768, "step_time": 28.863526406988967 }, { "clip_ratio/high_max": 0.00458884984254837, "clip_ratio/high_mean": 0.00458884984254837, "clip_ratio/low_mean": 0.003028582636034116, "clip_ratio/low_min": 0.003028582636034116, "clip_ratio/region_mean": 0.007617432449478656, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1899.6875, "completions/mean_terminated_length": 1899.6875, "completions/min_length": 1814.0, "completions/min_terminated_length": 1814.0, "entropy": 0.03772037336602807, "epoch": 3.076000123040005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006692246068269014, "kl": 0.0288119507022202, "learning_rate": 7.462979997031877e-06, "loss": 0.0003, "num_tokens": 20224790.0, "reward": -2.462620735168457, "reward_std": 10.54018783569336, "rewards/rollout_reward_func/mean": -2.462620735168457, "rewards/rollout_reward_func/std": 10.540188789367676, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.3125, "sampling/sampling_logp_difference/mean": 0.25956279039382935, "step": 769, "step_time": 28.63264077399799 }, { "clip_ratio/high_max": 0.004798228124855086, "clip_ratio/high_mean": 0.004798228124855086, "clip_ratio/low_mean": 0.003110584046225995, "clip_ratio/low_min": 0.003110584046225995, "clip_ratio/region_mean": 0.00790881214197725, "completions/clipped_ratio": 0.0, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 1965.0, "completions/mean_terminated_length": 1965.0, "completions/min_length": 1912.0, "completions/min_terminated_length": 1912.0, "entropy": 0.03827472636476159, "epoch": 3.080000123200005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010995591059327126, "kl": 0.03184981155209243, "learning_rate": 7.4629799970237725e-06, "loss": 0.0003, "num_tokens": 20269160.0, "reward": 2.816974639892578, "reward_std": 10.965965270996094, "rewards/rollout_reward_func/mean": 2.816974639892578, "rewards/rollout_reward_func/std": 10.965965270996094, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.8125, "sampling/sampling_logp_difference/mean": 0.25608888268470764, "step": 770, "step_time": 28.617739340974367 }, { "clip_ratio/high_max": 0.004149178537772968, "clip_ratio/high_mean": 0.004149178537772968, "clip_ratio/low_mean": 0.0037389032659120858, "clip_ratio/low_min": 0.0037389032659120858, "clip_ratio/region_mean": 0.007888081832788885, "completions/clipped_ratio": 0.0, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 1859.4375, "completions/mean_terminated_length": 1859.4375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "entropy": 0.03868004959076643, "epoch": 3.084000123360005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010000741109251976, "kl": 0.02933930978178978, "learning_rate": 7.462979997015658e-06, "loss": 0.0003, "num_tokens": 20311836.0, "reward": -0.32865583896636963, "reward_std": 24.134538650512695, "rewards/rollout_reward_func/mean": -0.32865583896636963, "rewards/rollout_reward_func/std": 24.134540557861328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.875, "sampling/sampling_logp_difference/mean": 0.2519981265068054, "step": 771, "step_time": 28.331116073022713 }, { "clip_ratio/high_max": 0.0028359812567941844, "clip_ratio/high_mean": 0.0028359812567941844, "clip_ratio/low_mean": 0.003662181319668889, "clip_ratio/low_min": 0.003662181319668889, "clip_ratio/region_mean": 0.006498162518255413, "completions/clipped_ratio": 0.0, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 1955.4375, "completions/mean_terminated_length": 1955.4375, "completions/min_length": 1820.0, "completions/min_terminated_length": 1820.0, "entropy": 0.03968909941613674, "epoch": 3.088000123520005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008007173426449299, "kl": 0.030290273716673255, "learning_rate": 7.4629799970075315e-06, "loss": 0.0003, "num_tokens": 20356026.0, "reward": 4.479229927062988, "reward_std": 10.357013702392578, "rewards/rollout_reward_func/mean": 4.479229927062988, "rewards/rollout_reward_func/std": 10.357013702392578, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.5, "sampling/sampling_logp_difference/mean": 0.2529526650905609, "step": 772, "step_time": 29.032997425005306 }, { "clip_ratio/high_max": 0.003973924583988264, "clip_ratio/high_mean": 0.003973924583988264, "clip_ratio/low_mean": 0.0032427505939267576, "clip_ratio/low_min": 0.0032427505939267576, "clip_ratio/region_mean": 0.007216675148811191, "completions/clipped_ratio": 0.0, "completions/max_length": 2081.0, "completions/max_terminated_length": 2081.0, "completions/mean_length": 1877.25, "completions/mean_terminated_length": 1877.25, "completions/min_length": 1583.0, "completions/min_terminated_length": 1583.0, "entropy": 0.04164747707545757, "epoch": 3.092000123680005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0683102011680603, "kl": 0.059777219546958804, "learning_rate": 7.462979996999394e-06, "loss": 0.0005, "num_tokens": 20398941.0, "reward": 6.847453594207764, "reward_std": 36.199371337890625, "rewards/rollout_reward_func/mean": 6.847453594207764, "rewards/rollout_reward_func/std": 36.19937515258789, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.0, "sampling/sampling_logp_difference/mean": 0.2645142078399658, "step": 773, "step_time": 28.54083020701364 }, { "clip_ratio/high_max": 0.0026750079705379903, "clip_ratio/high_mean": 0.0026750079705379903, "clip_ratio/low_mean": 0.004932021780405194, "clip_ratio/low_min": 0.004932021780405194, "clip_ratio/region_mean": 0.007607029750943184, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1916.5625, "completions/mean_terminated_length": 1916.5625, "completions/min_length": 1833.0, "completions/min_terminated_length": 1833.0, "entropy": 0.03924741502851248, "epoch": 3.096000123840005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010470736771821976, "kl": 0.027787520550191402, "learning_rate": 7.462979996991246e-06, "loss": 0.0003, "num_tokens": 20442491.0, "reward": 0.007430911064147949, "reward_std": 9.05749797821045, "rewards/rollout_reward_func/mean": 0.007430911064147949, "rewards/rollout_reward_func/std": 9.05749797821045, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.796875, "sampling/sampling_logp_difference/mean": 0.24911758303642273, "step": 774, "step_time": 27.92898362201231 }, { "clip_ratio/high_max": 0.00425168321817182, "clip_ratio/high_mean": 0.00425168321817182, "clip_ratio/low_mean": 0.0036684306105598807, "clip_ratio/low_min": 0.0036684306105598807, "clip_ratio/region_mean": 0.007920113916043192, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 1956.125, "completions/mean_terminated_length": 1956.125, "completions/min_length": 1896.0, "completions/min_terminated_length": 1896.0, "entropy": 0.03988138446584344, "epoch": 3.100000124000005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005489266477525234, "kl": 0.026406634598970413, "learning_rate": 7.462979996983087e-06, "loss": 0.0003, "num_tokens": 20486704.0, "reward": -0.05892491340637207, "reward_std": 9.300389289855957, "rewards/rollout_reward_func/mean": -0.05892491340637207, "rewards/rollout_reward_func/std": 9.300389289855957, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.375, "sampling/sampling_logp_difference/mean": 0.24918772280216217, "step": 775, "step_time": 29.2058715679741 }, { "clip_ratio/high_max": 0.003314812609460205, "clip_ratio/high_mean": 0.003314812609460205, "clip_ratio/low_mean": 0.004476242495002225, "clip_ratio/low_min": 0.004476242495002225, "clip_ratio/region_mean": 0.0077910550171509385, "completions/clipped_ratio": 0.0, "completions/max_length": 2032.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1906.25, "completions/mean_terminated_length": 1906.25, "completions/min_length": 1809.0, "completions/min_terminated_length": 1809.0, "entropy": 0.038414273876696825, "epoch": 3.104000124160005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.019194364547729492, "kl": 0.031020433409139514, "learning_rate": 7.462979996974916e-06, "loss": 0.0003, "num_tokens": 20530080.0, "reward": 5.670014381408691, "reward_std": 18.58142852783203, "rewards/rollout_reward_func/mean": 5.670014381408691, "rewards/rollout_reward_func/std": 18.581430435180664, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.68806457519531, "sampling/sampling_logp_difference/mean": 0.2588516175746918, "step": 776, "step_time": 27.8165364000306 }, { "clip_ratio/high_max": 0.004332562617491931, "clip_ratio/high_mean": 0.004332562617491931, "clip_ratio/low_mean": 0.0030455468222498894, "clip_ratio/low_min": 0.0030455468222498894, "clip_ratio/region_mean": 0.007378109381534159, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1975.5625, "completions/mean_terminated_length": 1975.5625, "completions/min_length": 1807.0, "completions/min_terminated_length": 1807.0, "entropy": 0.04002506099641323, "epoch": 3.108000124320005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005388497840613127, "kl": 0.027978956466540694, "learning_rate": 7.462979996966735e-06, "loss": 0.0003, "num_tokens": 20574613.0, "reward": 3.957703113555908, "reward_std": 12.788880348205566, "rewards/rollout_reward_func/mean": 3.957703113555908, "rewards/rollout_reward_func/std": 12.788880348205566, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.95315170288086, "sampling/sampling_logp_difference/mean": 0.24464479088783264, "step": 777, "step_time": 28.45836379600223 }, { "clip_ratio/high_max": 0.00413465709425509, "clip_ratio/high_mean": 0.00413465709425509, "clip_ratio/low_mean": 0.003917167196050286, "clip_ratio/low_min": 0.003917167196050286, "clip_ratio/region_mean": 0.008051824173890054, "completions/clipped_ratio": 0.0, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1918.5, "completions/mean_terminated_length": 1918.5, "completions/min_length": 1842.0, "completions/min_terminated_length": 1842.0, "entropy": 0.03987689968198538, "epoch": 3.112000124480005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005617306102067232, "kl": 0.030246464535593987, "learning_rate": 7.4629799969585425e-06, "loss": 0.0003, "num_tokens": 20618199.0, "reward": -2.0551018714904785, "reward_std": 13.957444190979004, "rewards/rollout_reward_func/mean": -2.0551018714904785, "rewards/rollout_reward_func/std": 13.957444190979004, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.94140625, "sampling/sampling_logp_difference/mean": 0.2592725157737732, "step": 778, "step_time": 27.584683915003552 }, { "clip_ratio/high_max": 0.0030919716227799654, "clip_ratio/high_mean": 0.0030919716227799654, "clip_ratio/low_mean": 0.003860470067593269, "clip_ratio/low_min": 0.003860470067593269, "clip_ratio/region_mean": 0.006952441588509828, "completions/clipped_ratio": 0.0, "completions/max_length": 2066.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 2005.8125, "completions/mean_terminated_length": 2005.8125, "completions/min_length": 1899.0, "completions/min_terminated_length": 1899.0, "entropy": 0.03766894340515137, "epoch": 3.116000124640005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02885633334517479, "kl": 0.03431028197519481, "learning_rate": 7.462979996950339e-06, "loss": 0.0003, "num_tokens": 20663244.0, "reward": 4.354656219482422, "reward_std": 11.101053237915039, "rewards/rollout_reward_func/mean": 4.354656219482422, "rewards/rollout_reward_func/std": 11.101053237915039, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.5625, "sampling/sampling_logp_difference/mean": 0.24659350514411926, "step": 779, "step_time": 29.011225729016587 }, { "clip_ratio/high_max": 0.0027745783590944484, "clip_ratio/high_mean": 0.0027745783590944484, "clip_ratio/low_mean": 0.004590136813931167, "clip_ratio/low_min": 0.004590136813931167, "clip_ratio/region_mean": 0.007364715274889022, "completions/clipped_ratio": 0.0, "completions/max_length": 2061.0, "completions/max_terminated_length": 2061.0, "completions/mean_length": 1817.3125, "completions/mean_terminated_length": 1817.3125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 0.03914410388097167, "epoch": 3.120000124800005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004812664818018675, "kl": 0.031397625571116805, "learning_rate": 7.462979996942124e-06, "loss": 0.0003, "num_tokens": 20705205.0, "reward": 0.894446849822998, "reward_std": 25.93561363220215, "rewards/rollout_reward_func/mean": 0.894446849822998, "rewards/rollout_reward_func/std": 25.93561363220215, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.4375, "sampling/sampling_logp_difference/mean": 0.26415717601776123, "step": 780, "step_time": 28.249442158994498 }, { "clip_ratio/high_max": 0.0034404159523546696, "clip_ratio/high_mean": 0.0034404159523546696, "clip_ratio/low_mean": 0.004531912243692204, "clip_ratio/low_min": 0.004531912243692204, "clip_ratio/region_mean": 0.007972328225150704, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1821.0625, "completions/mean_terminated_length": 1821.0625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.03928231168538332, "epoch": 3.124000124960005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010108555667102337, "kl": 0.03089741338044405, "learning_rate": 7.462979996933899e-06, "loss": 0.0003, "num_tokens": 20747248.0, "reward": 9.057743072509766, "reward_std": 24.468902587890625, "rewards/rollout_reward_func/mean": 9.057743072509766, "rewards/rollout_reward_func/std": 24.468904495239258, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.64844512939453, "sampling/sampling_logp_difference/mean": 0.26685765385627747, "step": 781, "step_time": 28.075369265003246 }, { "clip_ratio/high_max": 0.004170463216723874, "clip_ratio/high_mean": 0.004170463216723874, "clip_ratio/low_mean": 0.003924009972251952, "clip_ratio/low_min": 0.003924009972251952, "clip_ratio/region_mean": 0.008094473218079656, "completions/clipped_ratio": 0.0, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 1926.0, "completions/mean_terminated_length": 1926.0, "completions/min_length": 1220.0, "completions/min_terminated_length": 1220.0, "entropy": 0.03892250033095479, "epoch": 3.128000125120005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004704967141151428, "kl": 0.027873546816408634, "learning_rate": 7.462979996925662e-06, "loss": 0.0003, "num_tokens": 20790977.0, "reward": 11.642602920532227, "reward_std": 33.603858947753906, "rewards/rollout_reward_func/mean": 11.642602920532227, "rewards/rollout_reward_func/std": 33.60386276245117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.1875, "sampling/sampling_logp_difference/mean": 0.2691010534763336, "step": 782, "step_time": 32.85653840001032 }, { "clip_ratio/high_max": 0.005399830639362335, "clip_ratio/high_mean": 0.005399830639362335, "clip_ratio/low_mean": 0.0031215303170029074, "clip_ratio/low_min": 0.0031215303170029074, "clip_ratio/region_mean": 0.008521360927261412, "completions/clipped_ratio": 0.0, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 1986.5, "completions/mean_terminated_length": 1986.5, "completions/min_length": 1796.0, "completions/min_terminated_length": 1796.0, "entropy": 0.037062058225274086, "epoch": 3.132000125280005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005607115104794502, "kl": 0.02511440380476415, "learning_rate": 7.462979996917414e-06, "loss": 0.0002, "num_tokens": 20835697.0, "reward": 0.06770825386047363, "reward_std": 14.113324165344238, "rewards/rollout_reward_func/mean": 0.06770825386047363, "rewards/rollout_reward_func/std": 14.113324165344238, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.43214797973633, "sampling/sampling_logp_difference/mean": 0.2579394578933716, "step": 783, "step_time": 29.988888337029493 }, { "clip_ratio/high_max": 0.0030936716066207737, "clip_ratio/high_mean": 0.0030936716066207737, "clip_ratio/low_mean": 0.004266101313987747, "clip_ratio/low_min": 0.004266101313987747, "clip_ratio/region_mean": 0.0073597729206085205, "completions/clipped_ratio": 0.0, "completions/max_length": 2051.0, "completions/max_terminated_length": 2051.0, "completions/mean_length": 1971.9375, "completions/mean_terminated_length": 1971.9375, "completions/min_length": 1894.0, "completions/min_terminated_length": 1894.0, "entropy": 0.03897792939096689, "epoch": 3.136000125440005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.06071648374199867, "kl": 0.04675066261552274, "learning_rate": 7.4629799969091546e-06, "loss": 0.0005, "num_tokens": 20880179.0, "reward": -2.4510064125061035, "reward_std": 17.323436737060547, "rewards/rollout_reward_func/mean": -2.4510064125061035, "rewards/rollout_reward_func/std": 17.323436737060547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.281280517578125, "sampling/sampling_logp_difference/mean": 0.2536378800868988, "step": 784, "step_time": 29.479143674005172 }, { "clip_ratio/high_max": 0.002651207396411337, "clip_ratio/high_mean": 0.002651207396411337, "clip_ratio/low_mean": 0.004547066113445908, "clip_ratio/low_min": 0.004547066113445908, "clip_ratio/region_mean": 0.007198273437097669, "completions/clipped_ratio": 0.0, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1825.75, "completions/mean_terminated_length": 1825.75, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.038130719447508454, "epoch": 3.140000125600005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0200339425355196, "kl": 0.03183295833878219, "learning_rate": 7.462979996900886e-06, "loss": 0.0003, "num_tokens": 20922274.0, "reward": 6.249594688415527, "reward_std": 25.996557235717773, "rewards/rollout_reward_func/mean": 6.249594688415527, "rewards/rollout_reward_func/std": 25.99655532836914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.3125, "sampling/sampling_logp_difference/mean": 0.26093870401382446, "step": 785, "step_time": 27.93684872300946 }, { "clip_ratio/high_max": 0.004095767828403041, "clip_ratio/high_mean": 0.004095767828403041, "clip_ratio/low_mean": 0.0037191692390479147, "clip_ratio/low_min": 0.0037191692390479147, "clip_ratio/region_mean": 0.007814937038347125, "completions/clipped_ratio": 0.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1937.5, "completions/mean_terminated_length": 1937.5, "completions/min_length": 1838.0, "completions/min_terminated_length": 1838.0, "entropy": 0.03677333891391754, "epoch": 3.144000125760005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0049675097689032555, "kl": 0.02367501356638968, "learning_rate": 7.462979996892604e-06, "loss": 0.0002, "num_tokens": 20966196.0, "reward": -3.6426217555999756, "reward_std": 13.963443756103516, "rewards/rollout_reward_func/mean": -3.6426217555999756, "rewards/rollout_reward_func/std": 13.963445663452148, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.724609375, "sampling/sampling_logp_difference/mean": 0.26160189509391785, "step": 786, "step_time": 28.775380407998455 }, { "clip_ratio/high_max": 0.004914665158139542, "clip_ratio/high_mean": 0.004914665158139542, "clip_ratio/low_mean": 0.003084073803620413, "clip_ratio/low_min": 0.003084073803620413, "clip_ratio/region_mean": 0.007998738961759955, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1882.5, "completions/mean_terminated_length": 1882.5, "completions/min_length": 1816.0, "completions/min_terminated_length": 1816.0, "entropy": 0.04104022914543748, "epoch": 3.148000125920005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0066758678294718266, "kl": 0.025965968030504882, "learning_rate": 7.462979996884313e-06, "loss": 0.0002, "num_tokens": 21009191.0, "reward": -3.1996641159057617, "reward_std": 9.622440338134766, "rewards/rollout_reward_func/mean": -3.1996641159057617, "rewards/rollout_reward_func/std": 9.622440338134766, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.93761444091797, "sampling/sampling_logp_difference/mean": 0.2623329758644104, "step": 787, "step_time": 28.778146382974228 }, { "clip_ratio/high_max": 0.0030042683938518167, "clip_ratio/high_mean": 0.0030042683938518167, "clip_ratio/low_mean": 0.004477930837310851, "clip_ratio/low_min": 0.004477930837310851, "clip_ratio/region_mean": 0.007482199289370328, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 1934.5, "completions/mean_terminated_length": 1934.5, "completions/min_length": 1820.0, "completions/min_terminated_length": 1820.0, "entropy": 0.037751815281808376, "epoch": 3.152000126080005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011425192467868328, "kl": 0.02779565774835646, "learning_rate": 7.46297999687601e-06, "loss": 0.0003, "num_tokens": 21053037.0, "reward": 3.362931728363037, "reward_std": 13.606873512268066, "rewards/rollout_reward_func/mean": 3.362931728363037, "rewards/rollout_reward_func/std": 13.606873512268066, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.5, "sampling/sampling_logp_difference/mean": 0.26819247007369995, "step": 788, "step_time": 29.303429138002684 }, { "clip_ratio/high_max": 0.0022607077480643056, "clip_ratio/high_mean": 0.0022607077480643056, "clip_ratio/low_mean": 0.004611950658727437, "clip_ratio/low_min": 0.004611950658727437, "clip_ratio/region_mean": 0.006872658443171531, "completions/clipped_ratio": 0.0, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 1837.3125, "completions/mean_terminated_length": 1837.3125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.036618510726839304, "epoch": 3.156000126240005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004819441121071577, "kl": 0.023722885875031352, "learning_rate": 7.462979996867696e-06, "loss": 0.0002, "num_tokens": 21095347.0, "reward": 8.198265075683594, "reward_std": 25.413908004760742, "rewards/rollout_reward_func/mean": 8.198265075683594, "rewards/rollout_reward_func/std": 25.413908004760742, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.0625, "sampling/sampling_logp_difference/mean": 0.2718784809112549, "step": 789, "step_time": 28.746498521009926 }, { "clip_ratio/high_max": 0.004046859277877957, "clip_ratio/high_mean": 0.004046859277877957, "clip_ratio/low_mean": 0.004240500973537564, "clip_ratio/low_min": 0.004240500973537564, "clip_ratio/region_mean": 0.008287360251415521, "completions/clipped_ratio": 0.0, "completions/max_length": 1984.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1883.75, "completions/mean_terminated_length": 1883.75, "completions/min_length": 1829.0, "completions/min_terminated_length": 1829.0, "entropy": 0.03947001043707132, "epoch": 3.160000126400005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008944780565798283, "kl": 0.025812425650656223, "learning_rate": 7.462979996859371e-06, "loss": 0.0002, "num_tokens": 21138353.0, "reward": -0.5686544179916382, "reward_std": 13.658865928649902, "rewards/rollout_reward_func/mean": -0.5686544179916382, "rewards/rollout_reward_func/std": 13.658865928649902, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.87500762939453, "sampling/sampling_logp_difference/mean": 0.2678675651550293, "step": 790, "step_time": 28.3212208479963 }, { "clip_ratio/high_max": 0.004298136191209778, "clip_ratio/high_mean": 0.004298136191209778, "clip_ratio/low_mean": 0.00330984074389562, "clip_ratio/low_min": 0.00330984074389562, "clip_ratio/region_mean": 0.007607976906001568, "completions/clipped_ratio": 0.0, "completions/max_length": 2024.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1940.0625, "completions/mean_terminated_length": 1940.0625, "completions/min_length": 1885.0, "completions/min_terminated_length": 1885.0, "entropy": 0.03710050415247679, "epoch": 3.1640001265600054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013101554475724697, "kl": 0.03011059807613492, "learning_rate": 7.4629799968510346e-06, "loss": 0.0003, "num_tokens": 21182303.0, "reward": -1.016033411026001, "reward_std": 13.518815994262695, "rewards/rollout_reward_func/mean": -1.016033411026001, "rewards/rollout_reward_func/std": 13.518815994262695, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.68262481689453, "sampling/sampling_logp_difference/mean": 0.25472813844680786, "step": 791, "step_time": 28.69306055402558 }, { "clip_ratio/high_max": 0.00395824498264119, "clip_ratio/high_mean": 0.00395824498264119, "clip_ratio/low_mean": 0.004228774836519733, "clip_ratio/low_min": 0.004228774836519733, "clip_ratio/region_mean": 0.008187019906472415, "completions/clipped_ratio": 0.0, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 1939.875, "completions/mean_terminated_length": 1939.875, "completions/min_length": 1828.0, "completions/min_terminated_length": 1828.0, "entropy": 0.03780687926337123, "epoch": 3.168000126720005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004629755858331919, "kl": 0.021559506189078093, "learning_rate": 7.462979996842687e-06, "loss": 0.0002, "num_tokens": 21226252.0, "reward": -4.421224117279053, "reward_std": 14.61473560333252, "rewards/rollout_reward_func/mean": -4.421224117279053, "rewards/rollout_reward_func/std": 14.61473560333252, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.6875, "sampling/sampling_logp_difference/mean": 0.26353368163108826, "step": 792, "step_time": 29.145990045013605 }, { "clip_ratio/high_max": 0.004074756987392902, "clip_ratio/high_mean": 0.004074756987392902, "clip_ratio/low_mean": 0.0037151151336729527, "clip_ratio/low_min": 0.0037151151336729527, "clip_ratio/region_mean": 0.007789872121065855, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 1995.8125, "completions/mean_terminated_length": 1995.8125, "completions/min_length": 1877.0, "completions/min_terminated_length": 1877.0, "entropy": 0.03588790283538401, "epoch": 3.172000126880005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005817830096930265, "kl": 0.022019493393599987, "learning_rate": 7.462979996834329e-06, "loss": 0.0002, "num_tokens": 21271119.0, "reward": -0.6557714343070984, "reward_std": 9.649884223937988, "rewards/rollout_reward_func/mean": -0.6557714343070984, "rewards/rollout_reward_func/std": 9.649885177612305, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.90625, "sampling/sampling_logp_difference/mean": 0.2566322088241577, "step": 793, "step_time": 29.36034873504832 }, { "clip_ratio/high_max": 0.003955923719331622, "clip_ratio/high_mean": 0.003955923719331622, "clip_ratio/low_mean": 0.003765687142731622, "clip_ratio/low_min": 0.003765687142731622, "clip_ratio/region_mean": 0.007721610716544092, "completions/clipped_ratio": 0.0, "completions/max_length": 1963.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1882.875, "completions/mean_terminated_length": 1882.875, "completions/min_length": 1820.0, "completions/min_terminated_length": 1820.0, "entropy": 0.03656627703458071, "epoch": 3.176000127040005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004930180497467518, "kl": 0.021401226986199617, "learning_rate": 7.46297999682596e-06, "loss": 0.0002, "num_tokens": 21314126.0, "reward": 2.473022222518921, "reward_std": 11.451070785522461, "rewards/rollout_reward_func/mean": 2.473022222518921, "rewards/rollout_reward_func/std": 11.451070785522461, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.21875, "sampling/sampling_logp_difference/mean": 0.2677896022796631, "step": 794, "step_time": 28.530281451021438 }, { "clip_ratio/high_max": 0.0036781039088964462, "clip_ratio/high_mean": 0.0036781039088964462, "clip_ratio/low_mean": 0.0045760660723317415, "clip_ratio/low_min": 0.0045760660723317415, "clip_ratio/region_mean": 0.008254169835709035, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1911.875, "completions/mean_terminated_length": 1911.875, "completions/min_length": 1368.0, "completions/min_terminated_length": 1368.0, "entropy": 0.03870323998853564, "epoch": 3.180000127200005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005312665365636349, "kl": 0.02334282617084682, "learning_rate": 7.462979996817579e-06, "loss": 0.0002, "num_tokens": 21357619.0, "reward": 8.815532684326172, "reward_std": 37.27708053588867, "rewards/rollout_reward_func/mean": 8.815532684326172, "rewards/rollout_reward_func/std": 37.27708435058594, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.07814025878906, "sampling/sampling_logp_difference/mean": 0.27728402614593506, "step": 795, "step_time": 29.33383031799167 }, { "clip_ratio/high_max": 0.004127452266402543, "clip_ratio/high_mean": 0.004127452266402543, "clip_ratio/low_mean": 0.004529537516646087, "clip_ratio/low_min": 0.004529537516646087, "clip_ratio/region_mean": 0.008656989899463952, "completions/clipped_ratio": 0.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 1785.1875, "completions/mean_terminated_length": 1785.1875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "entropy": 0.03696325607597828, "epoch": 3.1840001273600054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012702051550149918, "kl": 0.026028907159343362, "learning_rate": 7.462979996809189e-06, "loss": 0.0002, "num_tokens": 21399051.0, "reward": 5.532665729522705, "reward_std": 25.062124252319336, "rewards/rollout_reward_func/mean": 5.532665729522705, "rewards/rollout_reward_func/std": 25.06212043762207, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.125, "sampling/sampling_logp_difference/mean": 0.27072668075561523, "step": 796, "step_time": 27.93333574999997 }, { "clip_ratio/high_max": 0.002060833794530481, "clip_ratio/high_mean": 0.002060833794530481, "clip_ratio/low_mean": 0.004916674442938529, "clip_ratio/low_min": 0.004916674442938529, "clip_ratio/region_mean": 0.006977508193813264, "completions/clipped_ratio": 0.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1818.5, "completions/mean_terminated_length": 1818.5, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.03960428200662136, "epoch": 3.188000127520005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.016662772744894028, "kl": 0.02113986515905708, "learning_rate": 7.462979996800786e-06, "loss": 0.0002, "num_tokens": 21441040.0, "reward": 3.5286242961883545, "reward_std": 25.254018783569336, "rewards/rollout_reward_func/mean": 3.5286242961883545, "rewards/rollout_reward_func/std": 25.254016876220703, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.8125, "sampling/sampling_logp_difference/mean": 0.2718694508075714, "step": 797, "step_time": 28.17594191498938 }, { "clip_ratio/high_max": 0.003000249183969572, "clip_ratio/high_mean": 0.003000249183969572, "clip_ratio/low_mean": 0.005541825870750472, "clip_ratio/low_min": 0.005541825870750472, "clip_ratio/region_mean": 0.008542075112927705, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1837.0, "completions/mean_terminated_length": 1837.0, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.036448636557906866, "epoch": 3.192000127680005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010132852010428905, "kl": 0.023515291279181838, "learning_rate": 7.4629799967923724e-06, "loss": 0.0002, "num_tokens": 21483347.0, "reward": 12.168495178222656, "reward_std": 28.43983268737793, "rewards/rollout_reward_func/mean": 12.168495178222656, "rewards/rollout_reward_func/std": 28.43983268737793, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.516693115234375, "sampling/sampling_logp_difference/mean": 0.2740578055381775, "step": 798, "step_time": 28.55274123797426 }, { "clip_ratio/high_max": 0.0026730525714810938, "clip_ratio/high_mean": 0.0026730525714810938, "clip_ratio/low_mean": 0.005616456619463861, "clip_ratio/low_min": 0.005616456619463861, "clip_ratio/region_mean": 0.008289509220048785, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1903.5625, "completions/mean_terminated_length": 1903.5625, "completions/min_length": 1770.0, "completions/min_terminated_length": 1770.0, "entropy": 0.03652750886976719, "epoch": 3.196000127840005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008652263320982456, "kl": 0.02170403767377138, "learning_rate": 7.462979996783949e-06, "loss": 0.0002, "num_tokens": 21526701.0, "reward": 8.227165222167969, "reward_std": 24.93907928466797, "rewards/rollout_reward_func/mean": 8.227165222167969, "rewards/rollout_reward_func/std": 24.93907928466797, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.5625, "sampling/sampling_logp_difference/mean": 0.2839777171611786, "step": 799, "step_time": 28.34674277699378 }, { "clip_ratio/high_max": 0.0032096303766593337, "clip_ratio/high_mean": 0.0032096303766593337, "clip_ratio/low_mean": 0.005213000375078991, "clip_ratio/low_min": 0.005213000375078991, "clip_ratio/region_mean": 0.008422630780842155, "completions/clipped_ratio": 0.0, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 2011.0, "completions/mean_terminated_length": 2011.0, "completions/min_length": 1926.0, "completions/min_terminated_length": 1926.0, "entropy": 0.03459028550423682, "epoch": 3.200000128000005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.033814407885074615, "kl": 0.02996477240230888, "learning_rate": 7.462979996775513e-06, "loss": 0.0003, "num_tokens": 21571812.0, "reward": 2.468040943145752, "reward_std": 15.786785125732422, "rewards/rollout_reward_func/mean": 2.468040943145752, "rewards/rollout_reward_func/std": 15.786786079406738, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.9375, "sampling/sampling_logp_difference/mean": 0.25625357031822205, "step": 800, "step_time": 29.52609861899691 }, { "clip_ratio/high_max": 0.00346728932345286, "clip_ratio/high_mean": 0.00346728932345286, "clip_ratio/low_mean": 0.003926279692677781, "clip_ratio/low_min": 0.003926279692677781, "clip_ratio/region_mean": 0.0073935691034421325, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 2299.0, "completions/mean_terminated_length": 2299.0, "completions/min_length": 2206.0, "completions/min_terminated_length": 2206.0, "entropy": 0.03364088316448033, "epoch": 3.204000128160005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004841494373977184, "kl": 0.017316009965725243, "learning_rate": 7.462979996767066e-06, "loss": 0.0002, "num_tokens": 21621531.0, "reward": -1.675631046295166, "reward_std": 20.357492446899414, "rewards/rollout_reward_func/mean": -1.675631046295166, "rewards/rollout_reward_func/std": 20.357492446899414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.59375, "sampling/sampling_logp_difference/mean": 0.25119197368621826, "step": 801, "step_time": 31.98037437298626 }, { "clip_ratio/high_max": 0.0038986410945653915, "clip_ratio/high_mean": 0.0038986410945653915, "clip_ratio/low_mean": 0.004574171616695821, "clip_ratio/low_min": 0.004574171616695821, "clip_ratio/region_mean": 0.008472812653053552, "completions/clipped_ratio": 0.0, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 2195.875, "completions/mean_terminated_length": 2195.875, "completions/min_length": 2092.0, "completions/min_terminated_length": 2092.0, "entropy": 0.0352574922144413, "epoch": 3.208000128320005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.017839856445789337, "kl": 0.027434322284534574, "learning_rate": 7.462979996758608e-06, "loss": 0.0003, "num_tokens": 21669554.0, "reward": -9.22063159942627, "reward_std": 7.288678169250488, "rewards/rollout_reward_func/mean": -9.22063159942627, "rewards/rollout_reward_func/std": 7.288678169250488, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.46875, "sampling/sampling_logp_difference/mean": 0.2539291977882385, "step": 802, "step_time": 31.740357044982375 }, { "clip_ratio/high_max": 0.0038672474620398134, "clip_ratio/high_mean": 0.0038672474620398134, "clip_ratio/low_mean": 0.004298627201933414, "clip_ratio/low_min": 0.004298627201933414, "clip_ratio/region_mean": 0.008165874576661736, "completions/clipped_ratio": 0.0, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 2235.1875, "completions/mean_terminated_length": 2235.1875, "completions/min_length": 2121.0, "completions/min_terminated_length": 2121.0, "entropy": 0.0337415742687881, "epoch": 3.212000128480005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005204683635383844, "kl": 0.019442236400209367, "learning_rate": 7.46297999675014e-06, "loss": 0.0002, "num_tokens": 21718224.0, "reward": -6.659482479095459, "reward_std": 7.776278972625732, "rewards/rollout_reward_func/mean": -6.659482479095459, "rewards/rollout_reward_func/std": 7.776278972625732, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.328125, "sampling/sampling_logp_difference/mean": 0.2612137496471405, "step": 803, "step_time": 31.97513967299892 }, { "clip_ratio/high_max": 0.004174044297542423, "clip_ratio/high_mean": 0.004174044297542423, "clip_ratio/low_mean": 0.003336970985401422, "clip_ratio/low_min": 0.003336970985401422, "clip_ratio/region_mean": 0.007511015341151506, "completions/clipped_ratio": 0.0, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 2209.4375, "completions/mean_terminated_length": 2209.4375, "completions/min_length": 2106.0, "completions/min_terminated_length": 2106.0, "entropy": 0.03324771486222744, "epoch": 3.2160001286400054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010193011723458767, "kl": 0.02004599047359079, "learning_rate": 7.46297999674166e-06, "loss": 0.0002, "num_tokens": 21766462.0, "reward": -9.016164779663086, "reward_std": 10.755870819091797, "rewards/rollout_reward_func/mean": -9.016164779663086, "rewards/rollout_reward_func/std": 10.755871772766113, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.53125, "sampling/sampling_logp_difference/mean": 0.25394365191459656, "step": 804, "step_time": 31.0365832720272 }, { "clip_ratio/high_max": 0.00425061208079569, "clip_ratio/high_mean": 0.00425061208079569, "clip_ratio/low_mean": 0.003900126146618277, "clip_ratio/low_min": 0.003900126146618277, "clip_ratio/region_mean": 0.008150738256517798, "completions/clipped_ratio": 0.0, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 2161.125, "completions/mean_terminated_length": 2161.125, "completions/min_length": 2085.0, "completions/min_terminated_length": 2085.0, "entropy": 0.035863636527210474, "epoch": 3.220000128800005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.027693426236510277, "kl": 0.037828149856068194, "learning_rate": 7.462979996733169e-06, "loss": 0.0004, "num_tokens": 21813901.0, "reward": -9.135420799255371, "reward_std": 11.836008071899414, "rewards/rollout_reward_func/mean": -9.135420799255371, "rewards/rollout_reward_func/std": 11.83600902557373, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.921875, "sampling/sampling_logp_difference/mean": 0.26506465673446655, "step": 805, "step_time": 31.385593372993753 }, { "clip_ratio/high_max": 0.00370935897808522, "clip_ratio/high_mean": 0.00370935897808522, "clip_ratio/low_mean": 0.004134071583393961, "clip_ratio/low_min": 0.004134071583393961, "clip_ratio/region_mean": 0.00784343050327152, "completions/clipped_ratio": 0.0, "completions/max_length": 2411.0, "completions/max_terminated_length": 2411.0, "completions/mean_length": 2169.125, "completions/mean_terminated_length": 2169.125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.03214130294509232, "epoch": 3.224000128960005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003514291252940893, "kl": 0.017386552644893527, "learning_rate": 7.462979996724667e-06, "loss": 0.0002, "num_tokens": 21861527.0, "reward": -0.12519407272338867, "reward_std": 31.553462982177734, "rewards/rollout_reward_func/mean": -0.12519407272338867, "rewards/rollout_reward_func/std": 31.5534610748291, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.46875, "sampling/sampling_logp_difference/mean": 0.2530355751514435, "step": 806, "step_time": 31.50490097599686 }, { "clip_ratio/high_max": 0.0031088639807421714, "clip_ratio/high_mean": 0.0031088639807421714, "clip_ratio/low_mean": 0.004429622378665954, "clip_ratio/low_min": 0.004429622378665954, "clip_ratio/region_mean": 0.007538486330304295, "completions/clipped_ratio": 0.0, "completions/max_length": 2276.0, "completions/max_terminated_length": 2276.0, "completions/mean_length": 2186.25, "completions/mean_terminated_length": 2186.25, "completions/min_length": 2069.0, "completions/min_terminated_length": 2069.0, "entropy": 0.03408632706850767, "epoch": 3.228000129120005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00859898328781128, "kl": 0.020817477139644325, "learning_rate": 7.462979996716154e-06, "loss": 0.0002, "num_tokens": 21909385.0, "reward": -6.23066520690918, "reward_std": 13.626538276672363, "rewards/rollout_reward_func/mean": -6.23066520690918, "rewards/rollout_reward_func/std": 13.626537322998047, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.34375, "sampling/sampling_logp_difference/mean": 0.26575326919555664, "step": 807, "step_time": 31.252980174002005 }, { "clip_ratio/high_max": 0.004006068338640034, "clip_ratio/high_mean": 0.004006068338640034, "clip_ratio/low_mean": 0.003583208716008812, "clip_ratio/low_min": 0.003583208716008812, "clip_ratio/region_mean": 0.007589277112856507, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 2230.375, "completions/mean_terminated_length": 2230.375, "completions/min_length": 2130.0, "completions/min_terminated_length": 2130.0, "entropy": 0.0330640624742955, "epoch": 3.232000129280005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004461915232241154, "kl": 0.018614951753988862, "learning_rate": 7.462979996707631e-06, "loss": 0.0002, "num_tokens": 21957975.0, "reward": -5.702986717224121, "reward_std": 9.752755165100098, "rewards/rollout_reward_func/mean": -5.702986717224121, "rewards/rollout_reward_func/std": 9.752755165100098, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.140628814697266, "sampling/sampling_logp_difference/mean": 0.250127911567688, "step": 808, "step_time": 31.93919279499096 }, { "clip_ratio/high_max": 0.003639567701611668, "clip_ratio/high_mean": 0.003639567701611668, "clip_ratio/low_mean": 0.004002096160547808, "clip_ratio/low_min": 0.004002096160547808, "clip_ratio/region_mean": 0.007641663833055645, "completions/clipped_ratio": 0.0, "completions/max_length": 2349.0, "completions/max_terminated_length": 2349.0, "completions/mean_length": 2220.875, "completions/mean_terminated_length": 2220.875, "completions/min_length": 2095.0, "completions/min_terminated_length": 2095.0, "entropy": 0.03239195863716304, "epoch": 3.2360001294400055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037852348759770393, "kl": 0.018587718484923244, "learning_rate": 7.4629799966990955e-06, "loss": 0.0002, "num_tokens": 22006403.0, "reward": -11.265435218811035, "reward_std": 12.682428359985352, "rewards/rollout_reward_func/mean": -11.265435218811035, "rewards/rollout_reward_func/std": 12.682428359985352, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.78147506713867, "sampling/sampling_logp_difference/mean": 0.2633892595767975, "step": 809, "step_time": 31.5214963999897 }, { "clip_ratio/high_max": 0.004388475092127919, "clip_ratio/high_mean": 0.004388475092127919, "clip_ratio/low_mean": 0.0034440689050825313, "clip_ratio/low_min": 0.0034440689050825313, "clip_ratio/region_mean": 0.007832544099073857, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 2261.5625, "completions/mean_terminated_length": 2261.5625, "completions/min_length": 2135.0, "completions/min_terminated_length": 2135.0, "entropy": 0.03188445442356169, "epoch": 3.240000129600005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.030768340453505516, "kl": 0.029817224014550447, "learning_rate": 7.462979996690549e-06, "loss": 0.0003, "num_tokens": 22055507.0, "reward": -8.857844352722168, "reward_std": 16.792564392089844, "rewards/rollout_reward_func/mean": -8.857844352722168, "rewards/rollout_reward_func/std": 16.792564392089844, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.187679290771484, "sampling/sampling_logp_difference/mean": 0.24966424703598022, "step": 810, "step_time": 32.2247999029787 }, { "clip_ratio/high_max": 0.0033352122991345823, "clip_ratio/high_mean": 0.0033352122991345823, "clip_ratio/low_mean": 0.004372056137071922, "clip_ratio/low_min": 0.004372056137071922, "clip_ratio/region_mean": 0.007707268407102674, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 2085.0, "completions/mean_terminated_length": 2085.0, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.033096152590587735, "epoch": 3.244000129760005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00405876012519002, "kl": 0.01870260643772781, "learning_rate": 7.462979996681993e-06, "loss": 0.0002, "num_tokens": 22101764.0, "reward": 3.0806071758270264, "reward_std": 25.648405075073242, "rewards/rollout_reward_func/mean": 3.0806071758270264, "rewards/rollout_reward_func/std": 25.648406982421875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.96875, "sampling/sampling_logp_difference/mean": 0.2545715272426605, "step": 811, "step_time": 31.241810847990564 }, { "clip_ratio/high_max": 0.00497727032052353, "clip_ratio/high_mean": 0.00497727032052353, "clip_ratio/low_mean": 0.0033955162798520178, "clip_ratio/low_min": 0.0033955162798520178, "clip_ratio/region_mean": 0.008372786571271718, "completions/clipped_ratio": 0.0, "completions/max_length": 2379.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 2203.1875, "completions/mean_terminated_length": 2203.1875, "completions/min_length": 1566.0, "completions/min_terminated_length": 1566.0, "entropy": 0.033221611520275474, "epoch": 3.2480001299200054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004086142871528864, "kl": 0.016584679251536727, "learning_rate": 7.462979996673424e-06, "loss": 0.0002, "num_tokens": 22149914.0, "reward": -1.8117163181304932, "reward_std": 32.14725875854492, "rewards/rollout_reward_func/mean": -1.8117163181304932, "rewards/rollout_reward_func/std": 32.14726257324219, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.8125, "sampling/sampling_logp_difference/mean": 0.2733454704284668, "step": 812, "step_time": 31.64038970497495 }, { "clip_ratio/high_max": 0.00291816376557108, "clip_ratio/high_mean": 0.00291816376557108, "clip_ratio/low_mean": 0.005331646418198943, "clip_ratio/low_min": 0.005331646418198943, "clip_ratio/region_mean": 0.0082498102565296, "completions/clipped_ratio": 0.0, "completions/max_length": 2336.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 2076.0625, "completions/mean_terminated_length": 2076.0625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.031185929663479328, "epoch": 3.252000130080005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003279436845332384, "kl": 0.016437674057669938, "learning_rate": 7.462979996664845e-06, "loss": 0.0002, "num_tokens": 22196026.0, "reward": 3.9615728855133057, "reward_std": 30.942928314208984, "rewards/rollout_reward_func/mean": 3.9615728855133057, "rewards/rollout_reward_func/std": 30.942930221557617, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.15625, "sampling/sampling_logp_difference/mean": 0.2681436240673065, "step": 813, "step_time": 31.248376729010488 }, { "clip_ratio/high_max": 0.0035765108186751604, "clip_ratio/high_mean": 0.0035765108186751604, "clip_ratio/low_mean": 0.004096393298823386, "clip_ratio/low_min": 0.004096393298823386, "clip_ratio/region_mean": 0.007672904001083225, "completions/clipped_ratio": 0.0, "completions/max_length": 2355.0, "completions/max_terminated_length": 2355.0, "completions/mean_length": 2238.3125, "completions/mean_terminated_length": 2238.3125, "completions/min_length": 2085.0, "completions/min_terminated_length": 2085.0, "entropy": 0.031840390525758266, "epoch": 3.2560001302400055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003649498103186488, "kl": 0.018499550758861005, "learning_rate": 7.462979996656255e-06, "loss": 0.0002, "num_tokens": 22244752.0, "reward": -6.4313130378723145, "reward_std": 18.13718605041504, "rewards/rollout_reward_func/mean": -6.4313130378723145, "rewards/rollout_reward_func/std": 18.137187957763672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.71875, "sampling/sampling_logp_difference/mean": 0.2615245580673218, "step": 814, "step_time": 31.984032135020243 }, { "clip_ratio/high_max": 0.0037182580563239753, "clip_ratio/high_mean": 0.0037182580563239753, "clip_ratio/low_mean": 0.0039086788456188515, "clip_ratio/low_min": 0.0039086788456188515, "clip_ratio/region_mean": 0.007626936887390912, "completions/clipped_ratio": 0.0, "completions/max_length": 2353.0, "completions/max_terminated_length": 2353.0, "completions/mean_length": 2096.75, "completions/mean_terminated_length": 2096.75, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.03344466211274266, "epoch": 3.260000130400005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005429632496088743, "kl": 0.01760838332120329, "learning_rate": 7.462979996647654e-06, "loss": 0.0002, "num_tokens": 22291174.0, "reward": -0.8034290075302124, "reward_std": 26.625980377197266, "rewards/rollout_reward_func/mean": -0.8034290075302124, "rewards/rollout_reward_func/std": 26.6259822845459, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.2578125, "sampling/sampling_logp_difference/mean": 0.26026782393455505, "step": 815, "step_time": 31.316276541983825 }, { "clip_ratio/high_max": 0.0036728721170220524, "clip_ratio/high_mean": 0.0036728721170220524, "clip_ratio/low_mean": 0.003174500074237585, "clip_ratio/low_min": 0.003174500074237585, "clip_ratio/region_mean": 0.006847372278571129, "completions/clipped_ratio": 0.0, "completions/max_length": 2379.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 2244.8125, "completions/mean_terminated_length": 2244.8125, "completions/min_length": 2139.0, "completions/min_terminated_length": 2139.0, "entropy": 0.03203397896140814, "epoch": 3.264000130560005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00938621535897255, "kl": 0.0185936865163967, "learning_rate": 7.462979996639041e-06, "loss": 0.0002, "num_tokens": 22340012.0, "reward": -5.980123043060303, "reward_std": 11.304360389709473, "rewards/rollout_reward_func/mean": -5.980123043060303, "rewards/rollout_reward_func/std": 11.304359436035156, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.20325469970703, "sampling/sampling_logp_difference/mean": 0.2560867667198181, "step": 816, "step_time": 31.541521499995724 }, { "clip_ratio/high_max": 0.004329043149482459, "clip_ratio/high_mean": 0.004329043149482459, "clip_ratio/low_mean": 0.0042780040530487895, "clip_ratio/low_min": 0.0042780040530487895, "clip_ratio/region_mean": 0.008607047202531248, "completions/clipped_ratio": 0.0, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 2298.0, "completions/mean_terminated_length": 2298.0, "completions/min_length": 2163.0, "completions/min_terminated_length": 2163.0, "entropy": 0.03322107158601284, "epoch": 3.2680001307200054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 2.8554916381835938, "kl": 0.3130266957450658, "learning_rate": 7.462979996630418e-06, "loss": 0.0033, "num_tokens": 22389727.0, "reward": -5.155925273895264, "reward_std": 7.714721202850342, "rewards/rollout_reward_func/mean": -5.155925273895264, "rewards/rollout_reward_func/std": 7.7147216796875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.5625, "sampling/sampling_logp_difference/mean": 0.25710806250572205, "step": 817, "step_time": 31.842020361989853 }, { "clip_ratio/high_max": 0.0033519252901896834, "clip_ratio/high_mean": 0.0033519252901896834, "clip_ratio/low_mean": 0.004565207607811317, "clip_ratio/low_min": 0.004565207607811317, "clip_ratio/region_mean": 0.007917132985312492, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 2291.0625, "completions/mean_terminated_length": 2291.0625, "completions/min_length": 2133.0, "completions/min_terminated_length": 2133.0, "entropy": 0.03138347901403904, "epoch": 3.272000130880005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012072385288774967, "kl": 0.020682373899035156, "learning_rate": 7.462979996621784e-06, "loss": 0.0002, "num_tokens": 22439325.0, "reward": -4.186701774597168, "reward_std": 13.578729629516602, "rewards/rollout_reward_func/mean": -4.186701774597168, "rewards/rollout_reward_func/std": 13.578730583190918, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.7421875, "sampling/sampling_logp_difference/mean": 0.2486032396554947, "step": 818, "step_time": 31.89116893101891 }, { "clip_ratio/high_max": 0.004580011067446321, "clip_ratio/high_mean": 0.004580011067446321, "clip_ratio/low_mean": 0.0037168334529269487, "clip_ratio/low_min": 0.0037168334529269487, "clip_ratio/region_mean": 0.008296844433061779, "completions/clipped_ratio": 0.0, "completions/max_length": 2315.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 2189.3125, "completions/mean_terminated_length": 2189.3125, "completions/min_length": 2061.0, "completions/min_terminated_length": 2061.0, "entropy": 0.03346578311175108, "epoch": 3.276000131040005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0054988074116408825, "kl": 0.019162304000928998, "learning_rate": 7.462979996613138e-06, "loss": 0.0002, "num_tokens": 22487232.0, "reward": -5.266231536865234, "reward_std": 7.628917694091797, "rewards/rollout_reward_func/mean": -5.266231536865234, "rewards/rollout_reward_func/std": 7.628918170928955, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.32813262939453, "sampling/sampling_logp_difference/mean": 0.2662208080291748, "step": 819, "step_time": 31.51555228199868 }, { "clip_ratio/high_max": 0.003725384420249611, "clip_ratio/high_mean": 0.003725384420249611, "clip_ratio/low_mean": 0.003709554555825889, "clip_ratio/low_min": 0.003709554555825889, "clip_ratio/region_mean": 0.007434938917867839, "completions/clipped_ratio": 0.0, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 2327.3125, "completions/mean_terminated_length": 2327.3125, "completions/min_length": 2220.0, "completions/min_terminated_length": 2220.0, "entropy": 0.033324209274724126, "epoch": 3.280000131200005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0040940227918326855, "kl": 0.01534352672751993, "learning_rate": 7.462979996604481e-06, "loss": 0.0002, "num_tokens": 22537435.0, "reward": -8.131537437438965, "reward_std": 9.222230911254883, "rewards/rollout_reward_func/mean": -8.131537437438965, "rewards/rollout_reward_func/std": 9.222230911254883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.125, "sampling/sampling_logp_difference/mean": 0.24040132761001587, "step": 820, "step_time": 32.07846268202411 }, { "clip_ratio/high_max": 0.0031115462334128097, "clip_ratio/high_mean": 0.0031115462334128097, "clip_ratio/low_mean": 0.004650312504963949, "clip_ratio/low_min": 0.004650312504963949, "clip_ratio/region_mean": 0.007761858752928674, "completions/clipped_ratio": 0.0, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 2068.875, "completions/mean_terminated_length": 2068.875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.03614141675643623, "epoch": 3.284000131360005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0048331813886761665, "kl": 0.016430045128799975, "learning_rate": 7.462979996595814e-06, "loss": 0.0002, "num_tokens": 22583419.0, "reward": -0.7391724586486816, "reward_std": 26.656156539916992, "rewards/rollout_reward_func/mean": -0.7391724586486816, "rewards/rollout_reward_func/std": 26.656156539916992, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.84375, "sampling/sampling_logp_difference/mean": 0.25884386897087097, "step": 821, "step_time": 30.630611796033918 }, { "clip_ratio/high_max": 0.004084985761437565, "clip_ratio/high_mean": 0.004084985761437565, "clip_ratio/low_mean": 0.004497418267419562, "clip_ratio/low_min": 0.004497418267419562, "clip_ratio/region_mean": 0.008582403999753296, "completions/clipped_ratio": 0.0, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 2215.1875, "completions/mean_terminated_length": 2215.1875, "completions/min_length": 1986.0, "completions/min_terminated_length": 1986.0, "entropy": 0.03500146442092955, "epoch": 3.2880001315200055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008355497382581234, "kl": 0.019738736795261502, "learning_rate": 7.4629799965871355e-06, "loss": 0.0002, "num_tokens": 22631763.0, "reward": -4.0697760581970215, "reward_std": 28.670907974243164, "rewards/rollout_reward_func/mean": -4.0697760581970215, "rewards/rollout_reward_func/std": 28.670909881591797, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.03125, "sampling/sampling_logp_difference/mean": 0.2608049213886261, "step": 822, "step_time": 31.69062327100255 }, { "clip_ratio/high_max": 0.004306062241084874, "clip_ratio/high_mean": 0.004306062241084874, "clip_ratio/low_mean": 0.0034217644424643368, "clip_ratio/low_min": 0.0034217644424643368, "clip_ratio/region_mean": 0.00772782665444538, "completions/clipped_ratio": 0.0, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 2141.0625, "completions/mean_terminated_length": 2141.0625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "entropy": 0.030419925693422556, "epoch": 3.292000131680005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003653360530734062, "kl": 0.01777454069815576, "learning_rate": 7.462979996578446e-06, "loss": 0.0002, "num_tokens": 22678931.0, "reward": -0.3352077007293701, "reward_std": 19.610511779785156, "rewards/rollout_reward_func/mean": -0.3352077007293701, "rewards/rollout_reward_func/std": 19.61051368713379, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.62507629394531, "sampling/sampling_logp_difference/mean": 0.24835005402565002, "step": 823, "step_time": 31.58723611298774 }, { "clip_ratio/high_max": 0.0037517963210120797, "clip_ratio/high_mean": 0.0037517963210120797, "clip_ratio/low_mean": 0.0033490571076981723, "clip_ratio/low_min": 0.0033490571076981723, "clip_ratio/region_mean": 0.007100853428710252, "completions/clipped_ratio": 0.0, "completions/max_length": 2369.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 2134.25, "completions/mean_terminated_length": 2134.25, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.03347874362953007, "epoch": 3.296000131840005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037181146908551455, "kl": 0.016231147572398186, "learning_rate": 7.462979996569746e-06, "loss": 0.0002, "num_tokens": 22725988.0, "reward": -5.297621726989746, "reward_std": 28.21294593811035, "rewards/rollout_reward_func/mean": -5.297621726989746, "rewards/rollout_reward_func/std": 28.21294593811035, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.25, "sampling/sampling_logp_difference/mean": 0.25277775526046753, "step": 824, "step_time": 31.29427434099489 }, { "clip_ratio/high_max": 0.004274097766028717, "clip_ratio/high_mean": 0.004274097766028717, "clip_ratio/low_mean": 0.003138874744763598, "clip_ratio/low_min": 0.003138874744763598, "clip_ratio/region_mean": 0.007412972510792315, "completions/clipped_ratio": 0.0, "completions/max_length": 2381.0, "completions/max_terminated_length": 2381.0, "completions/mean_length": 2182.6875, "completions/mean_terminated_length": 2182.6875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.03296888479962945, "epoch": 3.3000001320000054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0041152359917759895, "kl": 0.016195453819818795, "learning_rate": 7.462979996561033e-06, "loss": 0.0002, "num_tokens": 22773853.0, "reward": -2.964935541152954, "reward_std": 27.71304702758789, "rewards/rollout_reward_func/mean": -2.964935541152954, "rewards/rollout_reward_func/std": 27.71304702758789, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.125, "sampling/sampling_logp_difference/mean": 0.24925175309181213, "step": 825, "step_time": 31.3460208010074 }, { "clip_ratio/high_max": 0.0032139649556484073, "clip_ratio/high_mean": 0.0032139649556484073, "clip_ratio/low_mean": 0.005087771860416979, "clip_ratio/low_min": 0.005087771860416979, "clip_ratio/region_mean": 0.008301736786961555, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 2219.3125, "completions/mean_terminated_length": 2219.3125, "completions/min_length": 2104.0, "completions/min_terminated_length": 2104.0, "entropy": 0.03267966629937291, "epoch": 3.304000132160005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003405089722946286, "kl": 0.018305926583707333, "learning_rate": 7.462979996552311e-06, "loss": 0.0002, "num_tokens": 22822272.0, "reward": -4.1264119148254395, "reward_std": 11.38449764251709, "rewards/rollout_reward_func/mean": -4.1264119148254395, "rewards/rollout_reward_func/std": 11.38449764251709, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.5859375, "sampling/sampling_logp_difference/mean": 0.2645089030265808, "step": 826, "step_time": 32.246800648994395 }, { "clip_ratio/high_max": 0.003436468221480027, "clip_ratio/high_mean": 0.003436468221480027, "clip_ratio/low_mean": 0.004291693039704114, "clip_ratio/low_min": 0.004291693039704114, "clip_ratio/region_mean": 0.007728161406703293, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 2172.625, "completions/mean_terminated_length": 2172.625, "completions/min_length": 2050.0, "completions/min_terminated_length": 2050.0, "entropy": 0.03376442031003535, "epoch": 3.3080001323200056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0635727047920227, "kl": 0.02761620795354247, "learning_rate": 7.462979996543577e-06, "loss": 0.0003, "num_tokens": 22869899.0, "reward": -6.294310092926025, "reward_std": 12.38136100769043, "rewards/rollout_reward_func/mean": -6.294310092926025, "rewards/rollout_reward_func/std": 12.38136100769043, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.703125, "sampling/sampling_logp_difference/mean": 0.2655874192714691, "step": 827, "step_time": 31.82801354699768 }, { "clip_ratio/high_max": 0.004664909705752507, "clip_ratio/high_mean": 0.004664909705752507, "clip_ratio/low_mean": 0.003147879964672029, "clip_ratio/low_min": 0.003147879964672029, "clip_ratio/region_mean": 0.007812789641320705, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 2255.75, "completions/mean_terminated_length": 2255.75, "completions/min_length": 2141.0, "completions/min_terminated_length": 2141.0, "entropy": 0.0322305818554014, "epoch": 3.312000132480005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003502268111333251, "kl": 0.019902938161976635, "learning_rate": 7.462979996534832e-06, "loss": 0.0002, "num_tokens": 22918909.0, "reward": -3.8010501861572266, "reward_std": 8.911959648132324, "rewards/rollout_reward_func/mean": -3.8010501861572266, "rewards/rollout_reward_func/std": 8.91196060180664, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.03125, "sampling/sampling_logp_difference/mean": 0.24736155569553375, "step": 828, "step_time": 31.943074731025263 }, { "clip_ratio/high_max": 0.004684168612584472, "clip_ratio/high_mean": 0.004684168612584472, "clip_ratio/low_mean": 0.003030155989108607, "clip_ratio/low_min": 0.003030155989108607, "clip_ratio/region_mean": 0.00771432468900457, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 2279.5, "completions/mean_terminated_length": 2279.5, "completions/min_length": 2213.0, "completions/min_terminated_length": 2213.0, "entropy": 0.03141591651365161, "epoch": 3.316000132640005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00492533203214407, "kl": 0.01876437128521502, "learning_rate": 7.462979996526076e-06, "loss": 0.0002, "num_tokens": 22968315.0, "reward": -8.741827964782715, "reward_std": 6.894232273101807, "rewards/rollout_reward_func/mean": -8.741827964782715, "rewards/rollout_reward_func/std": 6.894232273101807, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.078125, "sampling/sampling_logp_difference/mean": 0.2518554925918579, "step": 829, "step_time": 32.521565365983406 }, { "clip_ratio/high_max": 0.003974381688749418, "clip_ratio/high_mean": 0.003974381688749418, "clip_ratio/low_mean": 0.0034584267414174974, "clip_ratio/low_min": 0.0034584267414174974, "clip_ratio/region_mean": 0.007432808401063085, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 2262.3125, "completions/mean_terminated_length": 2262.3125, "completions/min_length": 2210.0, "completions/min_terminated_length": 2210.0, "entropy": 0.03218235797248781, "epoch": 3.3200001328000055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004893624223768711, "kl": 0.018033129163086414, "learning_rate": 7.462979996517309e-06, "loss": 0.0002, "num_tokens": 23017450.0, "reward": -5.0950117111206055, "reward_std": 10.493667602539062, "rewards/rollout_reward_func/mean": -5.0950117111206055, "rewards/rollout_reward_func/std": 10.493667602539062, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.9375, "sampling/sampling_logp_difference/mean": 0.2523108720779419, "step": 830, "step_time": 32.16120682800829 }, { "clip_ratio/high_max": 0.004677090357290581, "clip_ratio/high_mean": 0.004677090357290581, "clip_ratio/low_mean": 0.003355293214553967, "clip_ratio/low_min": 0.003355293214553967, "clip_ratio/region_mean": 0.008032383571844548, "completions/clipped_ratio": 0.0, "completions/max_length": 2273.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 2180.875, "completions/mean_terminated_length": 2180.875, "completions/min_length": 2087.0, "completions/min_terminated_length": 2087.0, "entropy": 0.03445166698656976, "epoch": 3.324000132960005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004173492081463337, "kl": 0.01950856251642108, "learning_rate": 7.462979996508531e-06, "loss": 0.0002, "num_tokens": 23065217.0, "reward": -8.800798416137695, "reward_std": 8.909732818603516, "rewards/rollout_reward_func/mean": -8.800798416137695, "rewards/rollout_reward_func/std": 8.909732818603516, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.10938262939453, "sampling/sampling_logp_difference/mean": 0.2558283507823944, "step": 831, "step_time": 31.103675945967552 }, { "clip_ratio/high_max": 0.0037162413354963064, "clip_ratio/high_mean": 0.0037162413354963064, "clip_ratio/low_mean": 0.003983153466833755, "clip_ratio/low_min": 0.003983153466833755, "clip_ratio/region_mean": 0.007699394831433892, "completions/clipped_ratio": 0.0, "completions/max_length": 2366.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 2197.1875, "completions/mean_terminated_length": 2197.1875, "completions/min_length": 2079.0, "completions/min_terminated_length": 2079.0, "entropy": 0.03385423822328448, "epoch": 3.3280001331200056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006005849689245224, "kl": 0.0197971616871655, "learning_rate": 7.462979996499741e-06, "loss": 0.0002, "num_tokens": 23113254.0, "reward": -8.421262741088867, "reward_std": 8.625657081604004, "rewards/rollout_reward_func/mean": -8.421262741088867, "rewards/rollout_reward_func/std": 8.625657081604004, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.96875, "sampling/sampling_logp_difference/mean": 0.26768529415130615, "step": 832, "step_time": 31.954154860024573 }, { "clip_ratio/high_max": 0.0038818386383354664, "clip_ratio/high_mean": 0.0038818386383354664, "clip_ratio/low_mean": 0.0035713394463527948, "clip_ratio/low_min": 0.0035713394463527948, "clip_ratio/region_mean": 0.007453178113792092, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 2248.3125, "completions/mean_terminated_length": 2248.3125, "completions/min_length": 2072.0, "completions/min_terminated_length": 2072.0, "entropy": 0.03382634883746505, "epoch": 3.3320001332800054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005716449115425348, "kl": 0.018530648667365313, "learning_rate": 7.462979996490941e-06, "loss": 0.0002, "num_tokens": 23162134.0, "reward": -12.073808670043945, "reward_std": 10.271639823913574, "rewards/rollout_reward_func/mean": -12.073808670043945, "rewards/rollout_reward_func/std": 10.271639823913574, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.195316314697266, "sampling/sampling_logp_difference/mean": 0.25087741017341614, "step": 833, "step_time": 32.03733530097816 }, { "clip_ratio/high_max": 0.00481117915478535, "clip_ratio/high_mean": 0.00481117915478535, "clip_ratio/low_mean": 0.0031775643583387136, "clip_ratio/low_min": 0.0031775643583387136, "clip_ratio/region_mean": 0.007988743484020233, "completions/clipped_ratio": 0.0, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 2248.25, "completions/mean_terminated_length": 2248.25, "completions/min_length": 2082.0, "completions/min_terminated_length": 2082.0, "entropy": 0.032705443212762475, "epoch": 3.336000133440005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.07749857753515244, "kl": 0.03318972676061094, "learning_rate": 7.4629799964821305e-06, "loss": 0.0004, "num_tokens": 23211021.0, "reward": -8.323444366455078, "reward_std": 10.766986846923828, "rewards/rollout_reward_func/mean": -8.323444366455078, "rewards/rollout_reward_func/std": 10.766986846923828, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.39844512939453, "sampling/sampling_logp_difference/mean": 0.2576461732387543, "step": 834, "step_time": 31.77902250000625 }, { "clip_ratio/high_max": 0.0026571755879558623, "clip_ratio/high_mean": 0.0026571755879558623, "clip_ratio/low_mean": 0.0044978011574130505, "clip_ratio/low_min": 0.0044978011574130505, "clip_ratio/region_mean": 0.007154976774472743, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 2316.4375, "completions/mean_terminated_length": 2316.4375, "completions/min_length": 2174.0, "completions/min_terminated_length": 2174.0, "entropy": 0.03354455647058785, "epoch": 3.3400001336000055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0150920944288373, "kl": 0.02036031079478562, "learning_rate": 7.462979996473307e-06, "loss": 0.0002, "num_tokens": 23261023.0, "reward": -5.422621250152588, "reward_std": 9.735453605651855, "rewards/rollout_reward_func/mean": -5.422621250152588, "rewards/rollout_reward_func/std": 9.735454559326172, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.3125, "sampling/sampling_logp_difference/mean": 0.25101691484451294, "step": 835, "step_time": 31.890482095011976 }, { "clip_ratio/high_max": 0.004315186641179025, "clip_ratio/high_mean": 0.004315186641179025, "clip_ratio/low_mean": 0.003560534445568919, "clip_ratio/low_min": 0.003560534445568919, "clip_ratio/region_mean": 0.007875721086747944, "completions/clipped_ratio": 0.0, "completions/max_length": 2254.0, "completions/max_terminated_length": 2254.0, "completions/mean_length": 2046.875, "completions/mean_terminated_length": 2046.875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.03613452007994056, "epoch": 3.344000133760005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004050578456372023, "kl": 0.017064608866348863, "learning_rate": 7.462979996464475e-06, "loss": 0.0002, "num_tokens": 23306639.0, "reward": -0.30184435844421387, "reward_std": 27.32853126525879, "rewards/rollout_reward_func/mean": -0.30184435844421387, "rewards/rollout_reward_func/std": 27.32853126525879, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.046875, "sampling/sampling_logp_difference/mean": 0.27052393555641174, "step": 836, "step_time": 30.406834968016483 }, { "clip_ratio/high_max": 0.0031452847470063716, "clip_ratio/high_mean": 0.0031452847470063716, "clip_ratio/low_mean": 0.004098258214071393, "clip_ratio/low_min": 0.004098258214071393, "clip_ratio/region_mean": 0.007243542990181595, "completions/clipped_ratio": 0.0, "completions/max_length": 2365.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 2216.5625, "completions/mean_terminated_length": 2216.5625, "completions/min_length": 2078.0, "completions/min_terminated_length": 2078.0, "entropy": 0.035175768192857504, "epoch": 3.348000133920006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032504007685929537, "kl": 0.0190865327604115, "learning_rate": 7.462979996455631e-06, "loss": 0.0002, "num_tokens": 23354985.0, "reward": -8.06247329711914, "reward_std": 5.753068923950195, "rewards/rollout_reward_func/mean": -8.06247329711914, "rewards/rollout_reward_func/std": 5.753068923950195, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.75, "sampling/sampling_logp_difference/mean": 0.2573494017124176, "step": 837, "step_time": 31.991957983991597 }, { "clip_ratio/high_max": 0.0037293130153557286, "clip_ratio/high_mean": 0.0037293130153557286, "clip_ratio/low_mean": 0.0036203155177645385, "clip_ratio/low_min": 0.0036203155177645385, "clip_ratio/region_mean": 0.007349628605879843, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 2204.5, "completions/mean_terminated_length": 2204.5, "completions/min_length": 2088.0, "completions/min_terminated_length": 2088.0, "entropy": 0.033405084162950516, "epoch": 3.3520001340800054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036440908443182707, "kl": 0.017897737328894436, "learning_rate": 7.462979996446775e-06, "loss": 0.0002, "num_tokens": 23403156.0, "reward": -5.272979736328125, "reward_std": 10.380927085876465, "rewards/rollout_reward_func/mean": -5.272979736328125, "rewards/rollout_reward_func/std": 10.380926132202148, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.46875, "sampling/sampling_logp_difference/mean": 0.2545751929283142, "step": 838, "step_time": 32.10014985599264 }, { "clip_ratio/high_max": 0.003677341330330819, "clip_ratio/high_mean": 0.003677341330330819, "clip_ratio/low_mean": 0.004196266701910645, "clip_ratio/low_min": 0.004196266701910645, "clip_ratio/region_mean": 0.007873608148656785, "completions/clipped_ratio": 0.0, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 2167.75, "completions/mean_terminated_length": 2167.75, "completions/min_length": 2046.0, "completions/min_terminated_length": 2046.0, "entropy": 0.035950206220149994, "epoch": 3.356000134240005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003935269545763731, "kl": 0.01846789033152163, "learning_rate": 7.462979996437909e-06, "loss": 0.0002, "num_tokens": 23450727.0, "reward": 0.39777612686157227, "reward_std": 24.548498153686523, "rewards/rollout_reward_func/mean": 0.39777612686157227, "rewards/rollout_reward_func/std": 24.548498153686523, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.234375, "sampling/sampling_logp_difference/mean": 0.26705262064933777, "step": 839, "step_time": 30.870099930034485 }, { "clip_ratio/high_max": 0.0036403572594281286, "clip_ratio/high_mean": 0.0036403572594281286, "clip_ratio/low_mean": 0.0040695531060919166, "clip_ratio/low_min": 0.0040695531060919166, "clip_ratio/region_mean": 0.007709910394623876, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 2122.625, "completions/mean_terminated_length": 2122.625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "entropy": 0.035601824754849076, "epoch": 3.3600001344000056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004233799409121275, "kl": 0.0191325421910733, "learning_rate": 7.462979996429032e-06, "loss": 0.0002, "num_tokens": 23497604.0, "reward": -2.280993938446045, "reward_std": 29.587697982788086, "rewards/rollout_reward_func/mean": -2.280993938446045, "rewards/rollout_reward_func/std": 29.58769989013672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.375, "sampling/sampling_logp_difference/mean": 0.2638881504535675, "step": 840, "step_time": 31.277165085964953 }, { "clip_ratio/high_max": 0.0038942107930779457, "clip_ratio/high_mean": 0.0038942107930779457, "clip_ratio/low_mean": 0.003885396581608802, "clip_ratio/low_min": 0.003885396581608802, "clip_ratio/region_mean": 0.007779607374686748, "completions/clipped_ratio": 0.0, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 2232.3125, "completions/mean_terminated_length": 2232.3125, "completions/min_length": 2074.0, "completions/min_terminated_length": 2074.0, "entropy": 0.032161703100427985, "epoch": 3.364000134560005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011637821793556213, "kl": 0.019948195898905396, "learning_rate": 7.462979996420143e-06, "loss": 0.0002, "num_tokens": 23546217.0, "reward": -5.806368350982666, "reward_std": 9.104156494140625, "rewards/rollout_reward_func/mean": -5.806368350982666, "rewards/rollout_reward_func/std": 9.104156494140625, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.6875, "sampling/sampling_logp_difference/mean": 0.25856515765190125, "step": 841, "step_time": 31.916647425998235 }, { "clip_ratio/high_max": 0.004392407165141776, "clip_ratio/high_mean": 0.004392407165141776, "clip_ratio/low_mean": 0.004263006732799113, "clip_ratio/low_min": 0.004263006732799113, "clip_ratio/region_mean": 0.00865541392704472, "completions/clipped_ratio": 0.0, "completions/max_length": 2397.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 2275.125, "completions/mean_terminated_length": 2275.125, "completions/min_length": 2067.0, "completions/min_terminated_length": 2067.0, "entropy": 0.03146447171457112, "epoch": 3.368000134720005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010296215303242207, "kl": 0.021169670741073787, "learning_rate": 7.462979996411243e-06, "loss": 0.0002, "num_tokens": 23595544.0, "reward": -6.72067928314209, "reward_std": 8.782350540161133, "rewards/rollout_reward_func/mean": -6.72067928314209, "rewards/rollout_reward_func/std": 8.78235149383545, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.890655517578125, "sampling/sampling_logp_difference/mean": 0.25490230321884155, "step": 842, "step_time": 32.00151580199599 }, { "clip_ratio/high_max": 0.003245260246330872, "clip_ratio/high_mean": 0.003245260246330872, "clip_ratio/low_mean": 0.0039381751557812095, "clip_ratio/low_min": 0.0039381751557812095, "clip_ratio/region_mean": 0.007183435431215912, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 2237.875, "completions/mean_terminated_length": 2237.875, "completions/min_length": 2194.0, "completions/min_terminated_length": 2194.0, "entropy": 0.03239405807107687, "epoch": 3.3720001348800055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006858445703983307, "kl": 0.018234771909192204, "learning_rate": 7.462979996402332e-06, "loss": 0.0002, "num_tokens": 23644257.0, "reward": -2.87471079826355, "reward_std": 10.788418769836426, "rewards/rollout_reward_func/mean": -2.87471079826355, "rewards/rollout_reward_func/std": 10.788418769836426, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.632835388183594, "sampling/sampling_logp_difference/mean": 0.25129979848861694, "step": 843, "step_time": 31.447240690991748 }, { "clip_ratio/high_max": 0.003435570193687454, "clip_ratio/high_mean": 0.003435570193687454, "clip_ratio/low_mean": 0.0038441298820544034, "clip_ratio/low_min": 0.0038441298820544034, "clip_ratio/region_mean": 0.007279700017534196, "completions/clipped_ratio": 0.0, "completions/max_length": 2397.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 2292.75, "completions/mean_terminated_length": 2292.75, "completions/min_length": 2121.0, "completions/min_terminated_length": 2121.0, "entropy": 0.0319212032482028, "epoch": 3.376000135040005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00934585276991129, "kl": 0.016169220209121704, "learning_rate": 7.4629799963934115e-06, "loss": 0.0002, "num_tokens": 23693884.0, "reward": -6.089508056640625, "reward_std": 7.628510475158691, "rewards/rollout_reward_func/mean": -6.089508056640625, "rewards/rollout_reward_func/std": 7.62851095199585, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.359375, "sampling/sampling_logp_difference/mean": 0.2550605535507202, "step": 844, "step_time": 31.97374916699482 }, { "clip_ratio/high_max": 0.0028354070091154426, "clip_ratio/high_mean": 0.0028354070091154426, "clip_ratio/low_mean": 0.005331344087608159, "clip_ratio/low_min": 0.005331344087608159, "clip_ratio/region_mean": 0.008166751125827432, "completions/clipped_ratio": 0.0, "completions/max_length": 2297.0, "completions/max_terminated_length": 2297.0, "completions/mean_length": 2199.6875, "completions/mean_terminated_length": 2199.6875, "completions/min_length": 2118.0, "completions/min_terminated_length": 2118.0, "entropy": 0.034663021331653, "epoch": 3.3800001352000057e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.12352042645215988, "kl": 0.07320129394065589, "learning_rate": 7.462979996384479e-06, "loss": 0.0008, "num_tokens": 23741953.0, "reward": -7.27752161026001, "reward_std": 14.362142562866211, "rewards/rollout_reward_func/mean": -7.27752161026001, "rewards/rollout_reward_func/std": 14.362142562866211, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.7734375, "sampling/sampling_logp_difference/mean": 0.2515939176082611, "step": 845, "step_time": 31.381384951004293 }, { "clip_ratio/high_max": 0.005049968225648627, "clip_ratio/high_mean": 0.005049968225648627, "clip_ratio/low_mean": 0.00296747763059102, "clip_ratio/low_min": 0.00296747763059102, "clip_ratio/region_mean": 0.008017445856239647, "completions/clipped_ratio": 0.0, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 2198.0, "completions/mean_terminated_length": 2198.0, "completions/min_length": 2110.0, "completions/min_terminated_length": 2110.0, "entropy": 0.03236609557643533, "epoch": 3.3840001353600054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008378368802368641, "kl": 0.02037718368228525, "learning_rate": 7.462979996375534e-06, "loss": 0.0002, "num_tokens": 23790013.0, "reward": -4.450263500213623, "reward_std": 14.78010368347168, "rewards/rollout_reward_func/mean": -4.450263500213623, "rewards/rollout_reward_func/std": 14.78010368347168, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.78125, "sampling/sampling_logp_difference/mean": 0.26869863271713257, "step": 846, "step_time": 31.363010109009338 }, { "clip_ratio/high_max": 0.004423903825227171, "clip_ratio/high_mean": 0.004423903825227171, "clip_ratio/low_mean": 0.003376641689101234, "clip_ratio/low_min": 0.003376641689101234, "clip_ratio/region_mean": 0.007800545543432236, "completions/clipped_ratio": 0.0, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 2213.1875, "completions/mean_terminated_length": 2213.1875, "completions/min_length": 2105.0, "completions/min_terminated_length": 2105.0, "entropy": 0.035528391133993864, "epoch": 3.388000135520005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02409004047513008, "kl": 0.024651099229231477, "learning_rate": 7.462979996366579e-06, "loss": 0.0003, "num_tokens": 23838303.0, "reward": -10.288047790527344, "reward_std": 10.753256797790527, "rewards/rollout_reward_func/mean": -10.288047790527344, "rewards/rollout_reward_func/std": 10.753257751464844, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.125, "sampling/sampling_logp_difference/mean": 0.2671124339103699, "step": 847, "step_time": 31.891075239967904 }, { "clip_ratio/high_max": 0.0026974866341333836, "clip_ratio/high_mean": 0.0026974866341333836, "clip_ratio/low_mean": 0.004958723031450063, "clip_ratio/low_min": 0.004958723031450063, "clip_ratio/region_mean": 0.007656209752894938, "completions/clipped_ratio": 0.0, "completions/max_length": 2324.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 2069.125, "completions/mean_terminated_length": 2069.125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.034778851782903075, "epoch": 3.3920001356800056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003946595825254917, "kl": 0.01760428212583065, "learning_rate": 7.462979996357613e-06, "loss": 0.0002, "num_tokens": 23884300.0, "reward": 12.307735443115234, "reward_std": 40.93850326538086, "rewards/rollout_reward_func/mean": 12.307735443115234, "rewards/rollout_reward_func/std": 40.93850326538086, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.171878814697266, "sampling/sampling_logp_difference/mean": 0.2674761116504669, "step": 848, "step_time": 31.431569206994027 }, { "clip_ratio/high_max": 0.003736131708137691, "clip_ratio/high_mean": 0.003736131708137691, "clip_ratio/low_mean": 0.004078619764186442, "clip_ratio/low_min": 0.004078619764186442, "clip_ratio/region_mean": 0.007814751705154777, "completions/clipped_ratio": 0.0, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 2253.5, "completions/mean_terminated_length": 2253.5, "completions/min_length": 2106.0, "completions/min_terminated_length": 2106.0, "entropy": 0.03162485919892788, "epoch": 3.396000135840005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00836092233657837, "kl": 0.01702052040491253, "learning_rate": 7.462979996348636e-06, "loss": 0.0002, "num_tokens": 23933267.0, "reward": -8.01584243774414, "reward_std": 8.068598747253418, "rewards/rollout_reward_func/mean": -8.01584243774414, "rewards/rollout_reward_func/std": 8.068598747253418, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.1875, "sampling/sampling_logp_difference/mean": 0.2558474540710449, "step": 849, "step_time": 32.17737093301548 }, { "clip_ratio/high_max": 0.005171746219275519, "clip_ratio/high_mean": 0.005171746219275519, "clip_ratio/low_mean": 0.003649448335636407, "clip_ratio/low_min": 0.003649448335636407, "clip_ratio/region_mean": 0.008821194467600435, "completions/clipped_ratio": 0.0, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 2174.1875, "completions/mean_terminated_length": 2174.1875, "completions/min_length": 1909.0, "completions/min_terminated_length": 1909.0, "entropy": 0.03320848848670721, "epoch": 3.400000136000006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007115034386515617, "kl": 0.019829246681183577, "learning_rate": 7.462979996339649e-06, "loss": 0.0002, "num_tokens": 23980933.0, "reward": -7.088540554046631, "reward_std": 17.380895614624023, "rewards/rollout_reward_func/mean": -7.088540554046631, "rewards/rollout_reward_func/std": 17.380895614624023, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.53937911987305, "sampling/sampling_logp_difference/mean": 0.2622906267642975, "step": 850, "step_time": 32.037103288050275 }, { "clip_ratio/high_max": 0.004175362293608487, "clip_ratio/high_mean": 0.004175362293608487, "clip_ratio/low_mean": 0.004103496030438691, "clip_ratio/low_min": 0.004103496030438691, "clip_ratio/region_mean": 0.008278858382254839, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 2237.75, "completions/mean_terminated_length": 2237.75, "completions/min_length": 2090.0, "completions/min_terminated_length": 2090.0, "entropy": 0.031597204972058535, "epoch": 3.4040001361600055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0054132044315338135, "kl": 0.020818295190110803, "learning_rate": 7.46297999633065e-06, "loss": 0.0002, "num_tokens": 24029644.0, "reward": -7.677581310272217, "reward_std": 9.614702224731445, "rewards/rollout_reward_func/mean": -7.677581310272217, "rewards/rollout_reward_func/std": 9.614702224731445, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.28125, "sampling/sampling_logp_difference/mean": 0.2608140707015991, "step": 851, "step_time": 31.98768709199794 }, { "clip_ratio/high_max": 0.002777142100967467, "clip_ratio/high_mean": 0.002777142100967467, "clip_ratio/low_mean": 0.0047998258669395, "clip_ratio/low_min": 0.0047998258669395, "clip_ratio/region_mean": 0.007576967997010797, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 2162.9375, "completions/mean_terminated_length": 2162.9375, "completions/min_length": 2094.0, "completions/min_terminated_length": 2094.0, "entropy": 0.03343571815639734, "epoch": 3.408000136320005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005060007330030203, "kl": 0.02015663601923734, "learning_rate": 7.46297999632164e-06, "loss": 0.0002, "num_tokens": 24077125.0, "reward": -6.596558570861816, "reward_std": 8.152413368225098, "rewards/rollout_reward_func/mean": -6.596558570861816, "rewards/rollout_reward_func/std": 8.152414321899414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.89359664916992, "sampling/sampling_logp_difference/mean": 0.26154592633247375, "step": 852, "step_time": 31.26712964804028 }, { "clip_ratio/high_max": 0.005213629920035601, "clip_ratio/high_mean": 0.005213629920035601, "clip_ratio/low_mean": 0.004528792254859582, "clip_ratio/low_min": 0.004528792254859582, "clip_ratio/region_mean": 0.009742422262206674, "completions/clipped_ratio": 0.0, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 2181.625, "completions/mean_terminated_length": 2181.625, "completions/min_length": 2068.0, "completions/min_terminated_length": 2068.0, "entropy": 0.03026609029620886, "epoch": 3.4120001364800056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006371948402374983, "kl": 0.019245621282607317, "learning_rate": 7.462979996312619e-06, "loss": 0.0002, "num_tokens": 24124909.0, "reward": -1.0805063247680664, "reward_std": 24.4084529876709, "rewards/rollout_reward_func/mean": -1.0805063247680664, "rewards/rollout_reward_func/std": 24.40845489501953, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.0, "sampling/sampling_logp_difference/mean": 0.26031866669654846, "step": 853, "step_time": 31.755992729988066 }, { "clip_ratio/high_max": 0.002885347814299166, "clip_ratio/high_mean": 0.002885347814299166, "clip_ratio/low_mean": 0.005129853409016505, "clip_ratio/low_min": 0.005129853409016505, "clip_ratio/region_mean": 0.008015201310627162, "completions/clipped_ratio": 0.0, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 2199.6875, "completions/mean_terminated_length": 2199.6875, "completions/min_length": 1865.0, "completions/min_terminated_length": 1865.0, "entropy": 0.03288715402595699, "epoch": 3.4160001366400054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008362664841115475, "kl": 0.021188258659094572, "learning_rate": 7.462979996303585e-06, "loss": 0.0002, "num_tokens": 24172998.0, "reward": 2.8834915161132812, "reward_std": 29.87828826904297, "rewards/rollout_reward_func/mean": 2.8834915161132812, "rewards/rollout_reward_func/std": 29.87828826904297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.40625, "sampling/sampling_logp_difference/mean": 0.27288323640823364, "step": 854, "step_time": 31.533565616002306 }, { "clip_ratio/high_max": 0.0029410068527795374, "clip_ratio/high_mean": 0.0029410068527795374, "clip_ratio/low_mean": 0.004646354529540986, "clip_ratio/low_min": 0.004646354529540986, "clip_ratio/region_mean": 0.007587361324112862, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 2214.875, "completions/mean_terminated_length": 2214.875, "completions/min_length": 2074.0, "completions/min_terminated_length": 2074.0, "entropy": 0.031043897848576307, "epoch": 3.420000136800006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004984457045793533, "kl": 0.018325399258174002, "learning_rate": 7.462979996294542e-06, "loss": 0.0002, "num_tokens": 24221347.0, "reward": -2.592607021331787, "reward_std": 11.38313102722168, "rewards/rollout_reward_func/mean": -2.592607021331787, "rewards/rollout_reward_func/std": 11.38313102722168, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.4375, "sampling/sampling_logp_difference/mean": 0.2561211884021759, "step": 855, "step_time": 31.453538632005802 }, { "clip_ratio/high_max": 0.00367637793533504, "clip_ratio/high_mean": 0.00367637793533504, "clip_ratio/low_mean": 0.003710253397002816, "clip_ratio/low_min": 0.003710253397002816, "clip_ratio/region_mean": 0.007386631390545517, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 2183.75, "completions/mean_terminated_length": 2183.75, "completions/min_length": 2121.0, "completions/min_terminated_length": 2121.0, "entropy": 0.032141398871317506, "epoch": 3.4240001369600055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007184882648289204, "kl": 0.020555197959765792, "learning_rate": 7.462979996285489e-06, "loss": 0.0002, "num_tokens": 24269166.0, "reward": -7.211322784423828, "reward_std": 8.905271530151367, "rewards/rollout_reward_func/mean": -7.211322784423828, "rewards/rollout_reward_func/std": 8.90527057647705, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.15625, "sampling/sampling_logp_difference/mean": 0.26191064715385437, "step": 856, "step_time": 31.300115109988838 }, { "clip_ratio/high_max": 0.0032004180538933724, "clip_ratio/high_mean": 0.0032004180538933724, "clip_ratio/low_mean": 0.0043496337020769715, "clip_ratio/low_min": 0.0043496337020769715, "clip_ratio/region_mean": 0.007550051843281835, "completions/clipped_ratio": 0.0, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 2128.625, "completions/mean_terminated_length": 2128.625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.030303229577839375, "epoch": 3.428000137120005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002982786623761058, "kl": 0.01580518565606326, "learning_rate": 7.4629799962764235e-06, "loss": 0.0002, "num_tokens": 24316139.0, "reward": -2.0208706855773926, "reward_std": 24.95405387878418, "rewards/rollout_reward_func/mean": -2.0208706855773926, "rewards/rollout_reward_func/std": 24.954057693481445, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.59375, "sampling/sampling_logp_difference/mean": 0.25763171911239624, "step": 857, "step_time": 35.62703197900555 }, { "clip_ratio/high_max": 0.0033499419514555484, "clip_ratio/high_mean": 0.0033499419514555484, "clip_ratio/low_mean": 0.004379881836939603, "clip_ratio/low_min": 0.004379881836939603, "clip_ratio/region_mean": 0.007729823933914304, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 2004.6875, "completions/mean_terminated_length": 2004.6875, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 0.032466673757880926, "epoch": 3.432000137280006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005862332880496979, "kl": 0.018405526177957654, "learning_rate": 7.4629799962673475e-06, "loss": 0.0002, "num_tokens": 24361116.0, "reward": 4.082322120666504, "reward_std": 35.34458923339844, "rewards/rollout_reward_func/mean": 4.082322120666504, "rewards/rollout_reward_func/std": 35.34458923339844, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.12501907348633, "sampling/sampling_logp_difference/mean": 0.2617266774177551, "step": 858, "step_time": 31.07208096800605 }, { "clip_ratio/high_max": 0.005221120925853029, "clip_ratio/high_mean": 0.005221120925853029, "clip_ratio/low_mean": 0.003429395379498601, "clip_ratio/low_min": 0.003429395379498601, "clip_ratio/region_mean": 0.0086505162762478, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 2217.75, "completions/mean_terminated_length": 2217.75, "completions/min_length": 2095.0, "completions/min_terminated_length": 2095.0, "entropy": 0.03193507017567754, "epoch": 3.4360001374400054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008581656962633133, "kl": 0.017876891768537462, "learning_rate": 7.462979996258261e-06, "loss": 0.0002, "num_tokens": 24409521.0, "reward": -7.310536861419678, "reward_std": 8.211145401000977, "rewards/rollout_reward_func/mean": -7.310536861419678, "rewards/rollout_reward_func/std": 8.211145401000977, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.95313262939453, "sampling/sampling_logp_difference/mean": 0.2634833753108978, "step": 859, "step_time": 32.310525502005476 }, { "clip_ratio/high_max": 0.003423965099500492, "clip_ratio/high_mean": 0.003423965099500492, "clip_ratio/low_mean": 0.004884421156020835, "clip_ratio/low_min": 0.004884421156020835, "clip_ratio/region_mean": 0.008308386255521327, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 2186.5, "completions/mean_terminated_length": 2186.5, "completions/min_length": 1791.0, "completions/min_terminated_length": 1791.0, "entropy": 0.032430657651275396, "epoch": 3.440000137600005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0040196822956204414, "kl": 0.018433239427395165, "learning_rate": 7.462979996249161e-06, "loss": 0.0002, "num_tokens": 24457403.0, "reward": 0.614159345626831, "reward_std": 18.64298439025879, "rewards/rollout_reward_func/mean": 0.614159345626831, "rewards/rollout_reward_func/std": 18.642982482910156, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.375, "sampling/sampling_logp_difference/mean": 0.2720777988433838, "step": 860, "step_time": 31.914197285019327 }, { "clip_ratio/high_max": 0.003522362618241459, "clip_ratio/high_mean": 0.003522362618241459, "clip_ratio/low_mean": 0.00504273473052308, "clip_ratio/low_min": 0.00504273473052308, "clip_ratio/region_mean": 0.008565097290556878, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 2224.0, "completions/mean_terminated_length": 2224.0, "completions/min_length": 2097.0, "completions/min_terminated_length": 2097.0, "entropy": 0.0325902309268713, "epoch": 3.4440001377600056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002307478804141283, "kl": 0.015399989322759211, "learning_rate": 7.462979996240052e-06, "loss": 0.0002, "num_tokens": 24505886.0, "reward": -6.228550910949707, "reward_std": 20.07801055908203, "rewards/rollout_reward_func/mean": -6.228550910949707, "rewards/rollout_reward_func/std": 20.078012466430664, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.4375, "sampling/sampling_logp_difference/mean": 0.2571166455745697, "step": 861, "step_time": 32.35293883099803 }, { "clip_ratio/high_max": 0.003746483416762203, "clip_ratio/high_mean": 0.003746483416762203, "clip_ratio/low_mean": 0.004648436937713996, "clip_ratio/low_min": 0.004648436937713996, "clip_ratio/region_mean": 0.008394920267164707, "completions/clipped_ratio": 0.0, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 2206.75, "completions/mean_terminated_length": 2206.75, "completions/min_length": 2089.0, "completions/min_terminated_length": 2089.0, "entropy": 0.03103918768465519, "epoch": 3.448000137920005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005875706672668457, "kl": 0.01858324685599655, "learning_rate": 7.462979996230931e-06, "loss": 0.0002, "num_tokens": 24554085.0, "reward": -4.540422439575195, "reward_std": 7.868175029754639, "rewards/rollout_reward_func/mean": -4.540422439575195, "rewards/rollout_reward_func/std": 7.8681745529174805, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.468753814697266, "sampling/sampling_logp_difference/mean": 0.2643545866012573, "step": 862, "step_time": 31.824117817974184 }, { "clip_ratio/high_max": 0.004548927943687886, "clip_ratio/high_mean": 0.004548927943687886, "clip_ratio/low_mean": 0.003023302328074351, "clip_ratio/low_min": 0.003023302328074351, "clip_ratio/region_mean": 0.0075722302426584065, "completions/clipped_ratio": 0.0, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 2268.375, "completions/mean_terminated_length": 2268.375, "completions/min_length": 2136.0, "completions/min_terminated_length": 2136.0, "entropy": 0.03125918540172279, "epoch": 3.452000138080006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003466258989647031, "kl": 0.015863924752920866, "learning_rate": 7.4629799962218e-06, "loss": 0.0002, "num_tokens": 24603309.0, "reward": -2.145383834838867, "reward_std": 11.043560981750488, "rewards/rollout_reward_func/mean": -2.145383834838867, "rewards/rollout_reward_func/std": 11.043561935424805, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.43750762939453, "sampling/sampling_logp_difference/mean": 0.24589955806732178, "step": 863, "step_time": 31.91174212702026 }, { "clip_ratio/high_max": 0.0034913015842903405, "clip_ratio/high_mean": 0.0034913015842903405, "clip_ratio/low_mean": 0.004004083981271833, "clip_ratio/low_min": 0.004004083981271833, "clip_ratio/region_mean": 0.007495385536458343, "completions/clipped_ratio": 0.0, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 2116.6875, "completions/mean_terminated_length": 2116.6875, "completions/min_length": 1596.0, "completions/min_terminated_length": 1596.0, "entropy": 0.033662872621789575, "epoch": 3.4560001382400055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006180096883326769, "kl": 0.02155235642567277, "learning_rate": 7.462979996212657e-06, "loss": 0.0002, "num_tokens": 24650045.0, "reward": 7.576971054077148, "reward_std": 38.989501953125, "rewards/rollout_reward_func/mean": 7.576971054077148, "rewards/rollout_reward_func/std": 38.989501953125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.486328125, "sampling/sampling_logp_difference/mean": 0.2787773609161377, "step": 864, "step_time": 31.306625568977324 }, { "clip_ratio/high_max": 0.005341383424820378, "clip_ratio/high_mean": 0.005341383424820378, "clip_ratio/low_mean": 0.0026418350753374398, "clip_ratio/low_min": 0.0026418350753374398, "clip_ratio/region_mean": 0.007983218529261649, "completions/clipped_ratio": 0.0, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 2254.125, "completions/mean_terminated_length": 2254.125, "completions/min_length": 2089.0, "completions/min_terminated_length": 2089.0, "entropy": 0.029938449850305915, "epoch": 3.460000138400005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007335405796766281, "kl": 0.019500617519952357, "learning_rate": 7.4629799962035024e-06, "loss": 0.0002, "num_tokens": 24699015.0, "reward": -11.170019149780273, "reward_std": 8.701305389404297, "rewards/rollout_reward_func/mean": -11.170019149780273, "rewards/rollout_reward_func/std": 8.701305389404297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.953125, "sampling/sampling_logp_difference/mean": 0.2547849118709564, "step": 865, "step_time": 31.847592955993605 }, { "clip_ratio/high_max": 0.0030994724947959185, "clip_ratio/high_mean": 0.0030994724947959185, "clip_ratio/low_mean": 0.00490532856201753, "clip_ratio/low_min": 0.00490532856201753, "clip_ratio/region_mean": 0.00800480111502111, "completions/clipped_ratio": 0.0, "completions/max_length": 2311.0, "completions/max_terminated_length": 2311.0, "completions/mean_length": 2172.5625, "completions/mean_terminated_length": 2172.5625, "completions/min_length": 1669.0, "completions/min_terminated_length": 1669.0, "entropy": 0.03143938980065286, "epoch": 3.4640001385600057e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003142219502478838, "kl": 0.01812814048025757, "learning_rate": 7.462979996194338e-06, "loss": 0.0002, "num_tokens": 24746652.0, "reward": 4.341385364532471, "reward_std": 35.105892181396484, "rewards/rollout_reward_func/mean": 4.341385364532471, "rewards/rollout_reward_func/std": 35.10588836669922, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.09375, "sampling/sampling_logp_difference/mean": 0.2777434289455414, "step": 866, "step_time": 31.677432156007853 }, { "clip_ratio/high_max": 0.004242899012751877, "clip_ratio/high_mean": 0.004242899012751877, "clip_ratio/low_mean": 0.0037273142370395362, "clip_ratio/low_min": 0.0037273142370395362, "clip_ratio/region_mean": 0.007970213307999074, "completions/clipped_ratio": 0.0, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 2229.5625, "completions/mean_terminated_length": 2229.5625, "completions/min_length": 2113.0, "completions/min_terminated_length": 2113.0, "entropy": 0.029427966801449656, "epoch": 3.4680001387200054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00376489432528615, "kl": 0.016072575701400638, "learning_rate": 7.4629799961851625e-06, "loss": 0.0002, "num_tokens": 24795215.0, "reward": -8.810468673706055, "reward_std": 5.578307628631592, "rewards/rollout_reward_func/mean": -8.810468673706055, "rewards/rollout_reward_func/std": 5.578307628631592, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.875, "sampling/sampling_logp_difference/mean": 0.25829577445983887, "step": 867, "step_time": 31.719095669017406 }, { "clip_ratio/high_max": 0.0026702892791945487, "clip_ratio/high_mean": 0.0026702892791945487, "clip_ratio/low_mean": 0.004469253879506141, "clip_ratio/low_min": 0.004469253879506141, "clip_ratio/region_mean": 0.007139543129596859, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 2256.8125, "completions/mean_terminated_length": 2256.8125, "completions/min_length": 2079.0, "completions/min_terminated_length": 2079.0, "entropy": 0.028435949003323913, "epoch": 3.472000138880006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036970865912735462, "kl": 0.016861118725501, "learning_rate": 7.462979996175976e-06, "loss": 0.0002, "num_tokens": 24844246.0, "reward": -5.639452934265137, "reward_std": 10.46061897277832, "rewards/rollout_reward_func/mean": -5.639452934265137, "rewards/rollout_reward_func/std": 10.460618019104004, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.37696838378906, "sampling/sampling_logp_difference/mean": 0.25278186798095703, "step": 868, "step_time": 31.839975257986225 }, { "clip_ratio/high_max": 0.0047371433465741575, "clip_ratio/high_mean": 0.0047371433465741575, "clip_ratio/low_mean": 0.0032830304699018598, "clip_ratio/low_min": 0.0032830304699018598, "clip_ratio/region_mean": 0.008020173816476017, "completions/clipped_ratio": 0.0, "completions/max_length": 2339.0, "completions/max_terminated_length": 2339.0, "completions/mean_length": 2110.0625, "completions/mean_terminated_length": 2110.0625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.030987314647063613, "epoch": 3.4760001390400056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004132548812776804, "kl": 0.017387828789651394, "learning_rate": 7.4629799961667785e-06, "loss": 0.0002, "num_tokens": 24890911.0, "reward": -4.595208168029785, "reward_std": 25.51169776916504, "rewards/rollout_reward_func/mean": -4.595208168029785, "rewards/rollout_reward_func/std": 25.511699676513672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.746097564697266, "sampling/sampling_logp_difference/mean": 0.2568887174129486, "step": 869, "step_time": 31.331145626987563 }, { "clip_ratio/high_max": 0.002984187303809449, "clip_ratio/high_mean": 0.002984187303809449, "clip_ratio/low_mean": 0.004408689710544422, "clip_ratio/low_min": 0.004408689710544422, "clip_ratio/region_mean": 0.007392877014353871, "completions/clipped_ratio": 0.0, "completions/max_length": 2405.0, "completions/max_terminated_length": 2405.0, "completions/mean_length": 2134.0625, "completions/mean_terminated_length": 2134.0625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.031641937559470534, "epoch": 3.480000139200005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006954248994588852, "kl": 0.019259581924416125, "learning_rate": 7.462979996157569e-06, "loss": 0.0002, "num_tokens": 24937987.0, "reward": -2.132479190826416, "reward_std": 20.265256881713867, "rewards/rollout_reward_func/mean": -2.132479190826416, "rewards/rollout_reward_func/std": 20.265256881713867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.625, "sampling/sampling_logp_difference/mean": 0.25605133175849915, "step": 870, "step_time": 31.247682178014657 }, { "clip_ratio/high_max": 0.00410590015235357, "clip_ratio/high_mean": 0.00410590015235357, "clip_ratio/low_mean": 0.003914464730769396, "clip_ratio/low_min": 0.003914464730769396, "clip_ratio/region_mean": 0.008020364737603813, "completions/clipped_ratio": 0.0, "completions/max_length": 2364.0, "completions/max_terminated_length": 2364.0, "completions/mean_length": 2200.125, "completions/mean_terminated_length": 2200.125, "completions/min_length": 2123.0, "completions/min_terminated_length": 2123.0, "entropy": 0.03089888021349907, "epoch": 3.484000139360006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005407528951764107, "kl": 0.01782932016067207, "learning_rate": 7.46297999614835e-06, "loss": 0.0002, "num_tokens": 24986098.0, "reward": -1.225870132446289, "reward_std": 8.052265167236328, "rewards/rollout_reward_func/mean": -1.225870132446289, "rewards/rollout_reward_func/std": 8.052265167236328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.34375, "sampling/sampling_logp_difference/mean": 0.2602795660495758, "step": 871, "step_time": 31.920622384001035 }, { "clip_ratio/high_max": 0.004390740148664918, "clip_ratio/high_mean": 0.004390740148664918, "clip_ratio/low_mean": 0.004184802732197568, "clip_ratio/low_min": 0.004184802732197568, "clip_ratio/region_mean": 0.008575543004553765, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 2057.25, "completions/mean_terminated_length": 2057.25, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.030941466568037868, "epoch": 3.4880001395200055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025209574960172176, "kl": 0.015221928362734616, "learning_rate": 7.462979996139118e-06, "loss": 0.0002, "num_tokens": 25031888.0, "reward": -0.9981567859649658, "reward_std": 31.143718719482422, "rewards/rollout_reward_func/mean": -0.9981567859649658, "rewards/rollout_reward_func/std": 31.14371681213379, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.66407012939453, "sampling/sampling_logp_difference/mean": 0.2717151939868927, "step": 872, "step_time": 30.660407155999565 }, { "clip_ratio/high_max": 0.0027297193882986903, "clip_ratio/high_mean": 0.0027297193882986903, "clip_ratio/low_mean": 0.004017129627754912, "clip_ratio/low_min": 0.004017129627754912, "clip_ratio/region_mean": 0.0067468490451574326, "completions/clipped_ratio": 0.0, "completions/max_length": 2397.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 2240.5625, "completions/mean_terminated_length": 2240.5625, "completions/min_length": 2098.0, "completions/min_terminated_length": 2098.0, "entropy": 0.030449706129729748, "epoch": 3.492000139680006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031200493685901165, "kl": 0.016552122659049928, "learning_rate": 7.462979996129876e-06, "loss": 0.0002, "num_tokens": 25080649.0, "reward": -4.451284408569336, "reward_std": 10.456964492797852, "rewards/rollout_reward_func/mean": -4.451284408569336, "rewards/rollout_reward_func/std": 10.456963539123535, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.0625, "sampling/sampling_logp_difference/mean": 0.25953537225723267, "step": 873, "step_time": 32.12029834098939 }, { "clip_ratio/high_max": 0.003946326643927023, "clip_ratio/high_mean": 0.003946326643927023, "clip_ratio/low_mean": 0.0038287699571810663, "clip_ratio/low_min": 0.0038287699571810663, "clip_ratio/region_mean": 0.007775096572004259, "completions/clipped_ratio": 0.0, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 2296.0625, "completions/mean_terminated_length": 2296.0625, "completions/min_length": 2122.0, "completions/min_terminated_length": 2122.0, "entropy": 0.029227574821561575, "epoch": 3.4960001398400056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007894197478890419, "kl": 0.016573008615523577, "learning_rate": 7.462979996120624e-06, "loss": 0.0002, "num_tokens": 25130332.0, "reward": -5.990964889526367, "reward_std": 12.024816513061523, "rewards/rollout_reward_func/mean": -5.990964889526367, "rewards/rollout_reward_func/std": 12.02481746673584, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.1875, "sampling/sampling_logp_difference/mean": 0.24775472283363342, "step": 874, "step_time": 32.03349397899001 }, { "clip_ratio/high_max": 0.0041841303172986954, "clip_ratio/high_mean": 0.0041841303172986954, "clip_ratio/low_mean": 0.0031551101710647345, "clip_ratio/low_min": 0.0031551101710647345, "clip_ratio/region_mean": 0.0073392404592595994, "completions/clipped_ratio": 0.0, "completions/max_length": 2299.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 2240.9375, "completions/mean_terminated_length": 2240.9375, "completions/min_length": 2165.0, "completions/min_terminated_length": 2165.0, "entropy": 0.030599952675402164, "epoch": 3.5000001400000054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036102498415857553, "kl": 0.014779122895561159, "learning_rate": 7.462979996111359e-06, "loss": 0.0002, "num_tokens": 25179111.0, "reward": -8.503647804260254, "reward_std": 8.86219310760498, "rewards/rollout_reward_func/mean": -8.503647804260254, "rewards/rollout_reward_func/std": 8.862194061279297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 59.90625, "sampling/sampling_logp_difference/mean": 0.2556166350841522, "step": 875, "step_time": 31.418020467011956 }, { "clip_ratio/high_max": 0.004939730133628473, "clip_ratio/high_mean": 0.004939730133628473, "clip_ratio/low_mean": 0.0029790489934384823, "clip_ratio/low_min": 0.0029790489934384823, "clip_ratio/region_mean": 0.007918779097963125, "completions/clipped_ratio": 0.0, "completions/max_length": 2396.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 2206.9375, "completions/mean_terminated_length": 2206.9375, "completions/min_length": 2086.0, "completions/min_terminated_length": 2086.0, "entropy": 0.03026070282794535, "epoch": 3.504000140160006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038864698726683855, "kl": 0.01587369234766811, "learning_rate": 7.462979996102084e-06, "loss": 0.0002, "num_tokens": 25227317.0, "reward": -6.028453350067139, "reward_std": 12.3565673828125, "rewards/rollout_reward_func/mean": -6.028453350067139, "rewards/rollout_reward_func/std": 12.3565673828125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.3125, "sampling/sampling_logp_difference/mean": 0.2684865891933441, "step": 876, "step_time": 31.85515311200288 }, { "clip_ratio/high_max": 0.0038150475011207163, "clip_ratio/high_mean": 0.0038150475011207163, "clip_ratio/low_mean": 0.004089111753273755, "clip_ratio/low_min": 0.004089111753273755, "clip_ratio/region_mean": 0.007904159370809793, "completions/clipped_ratio": 0.0, "completions/max_length": 2311.0, "completions/max_terminated_length": 2311.0, "completions/mean_length": 2236.8125, "completions/mean_terminated_length": 2236.8125, "completions/min_length": 2188.0, "completions/min_terminated_length": 2188.0, "entropy": 0.031457177363336086, "epoch": 3.5080001403200055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002312677213922143, "kl": 0.015807464602403343, "learning_rate": 7.462979996092798e-06, "loss": 0.0002, "num_tokens": 25275987.0, "reward": -7.025562763214111, "reward_std": 8.50390625, "rewards/rollout_reward_func/mean": -7.025562763214111, "rewards/rollout_reward_func/std": 8.503907203674316, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.0390625, "sampling/sampling_logp_difference/mean": 0.258436381816864, "step": 877, "step_time": 31.923322872025892 }, { "clip_ratio/high_max": 0.0038885584799572825, "clip_ratio/high_mean": 0.0038885584799572825, "clip_ratio/low_mean": 0.0035194107331335545, "clip_ratio/low_min": 0.0035194107331335545, "clip_ratio/region_mean": 0.007407969154883176, "completions/clipped_ratio": 0.0, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 2201.5625, "completions/mean_terminated_length": 2201.5625, "completions/min_length": 2072.0, "completions/min_terminated_length": 2072.0, "entropy": 0.029433039482682943, "epoch": 3.512000140480006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0071957726031541824, "kl": 0.01785190822556615, "learning_rate": 7.462979996083499e-06, "loss": 0.0002, "num_tokens": 25324107.0, "reward": -5.427141189575195, "reward_std": 6.487157821655273, "rewards/rollout_reward_func/mean": -5.427141189575195, "rewards/rollout_reward_func/std": 6.487157821655273, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.28125, "sampling/sampling_logp_difference/mean": 0.2646220624446869, "step": 878, "step_time": 31.769828535005217 }, { "clip_ratio/high_max": 0.005098975409055129, "clip_ratio/high_mean": 0.005098975409055129, "clip_ratio/low_mean": 0.0026885124971158803, "clip_ratio/low_min": 0.0026885124971158803, "clip_ratio/region_mean": 0.007787487818859518, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 2233.0625, "completions/mean_terminated_length": 2233.0625, "completions/min_length": 2120.0, "completions/min_terminated_length": 2120.0, "entropy": 0.03135939431376755, "epoch": 3.516000140640006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004749651066958904, "kl": 0.015859193401411176, "learning_rate": 7.462979996074191e-06, "loss": 0.0002, "num_tokens": 25372746.0, "reward": -7.2496795654296875, "reward_std": 7.559586048126221, "rewards/rollout_reward_func/mean": -7.2496795654296875, "rewards/rollout_reward_func/std": 7.559586524963379, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.640625, "sampling/sampling_logp_difference/mean": 0.2628074586391449, "step": 879, "step_time": 31.948052284002188 }, { "clip_ratio/high_max": 0.004144374426687136, "clip_ratio/high_mean": 0.004144374426687136, "clip_ratio/low_mean": 0.0033167403016705066, "clip_ratio/low_min": 0.0033167403016705066, "clip_ratio/region_mean": 0.007461114670149982, "completions/clipped_ratio": 0.0, "completions/max_length": 2335.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 2227.5625, "completions/mean_terminated_length": 2227.5625, "completions/min_length": 2074.0, "completions/min_terminated_length": 2074.0, "entropy": 0.030932706780731678, "epoch": 3.5200001408000054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037165351677685976, "kl": 0.014723274391144514, "learning_rate": 7.462979996064872e-06, "loss": 0.0002, "num_tokens": 25421288.0, "reward": -3.0088231563568115, "reward_std": 8.862775802612305, "rewards/rollout_reward_func/mean": -3.0088231563568115, "rewards/rollout_reward_func/std": 8.862776756286621, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.8125, "sampling/sampling_logp_difference/mean": 0.2557716965675354, "step": 880, "step_time": 32.000923998013604 }, { "clip_ratio/high_max": 0.0033566775382496417, "clip_ratio/high_mean": 0.0033566775382496417, "clip_ratio/low_mean": 0.005142993904883042, "clip_ratio/low_min": 0.005142993904883042, "clip_ratio/region_mean": 0.008499671472236514, "completions/clipped_ratio": 0.0, "completions/max_length": 2553.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 2432.8125, "completions/mean_terminated_length": 2432.8125, "completions/min_length": 2320.0, "completions/min_terminated_length": 2320.0, "entropy": 0.029964234679937363, "epoch": 3.524000140960006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031489895191043615, "kl": 0.013716200250200927, "learning_rate": 7.462979996055542e-06, "loss": 0.0002, "num_tokens": 25473123.0, "reward": -1.6644996404647827, "reward_std": 19.659881591796875, "rewards/rollout_reward_func/mean": -1.6644996404647827, "rewards/rollout_reward_func/std": 19.659883499145508, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.304443359375, "sampling/sampling_logp_difference/mean": 0.24844703078269958, "step": 881, "step_time": 34.50648012297461 }, { "clip_ratio/high_max": 0.0043174317688681185, "clip_ratio/high_mean": 0.0043174317688681185, "clip_ratio/low_mean": 0.003725602087797597, "clip_ratio/low_min": 0.003725602087797597, "clip_ratio/region_mean": 0.008043033769354224, "completions/clipped_ratio": 0.0, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 2316.625, "completions/mean_terminated_length": 2316.625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.031047145137563348, "epoch": 3.5280001411200056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006130940746515989, "kl": 0.016442572057712823, "learning_rate": 7.462979996046199e-06, "loss": 0.0002, "num_tokens": 25523093.0, "reward": 2.214979410171509, "reward_std": 27.687496185302734, "rewards/rollout_reward_func/mean": 2.214979410171509, "rewards/rollout_reward_func/std": 27.687496185302734, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.046875, "sampling/sampling_logp_difference/mean": 0.24695155024528503, "step": 882, "step_time": 33.06916088999424 }, { "clip_ratio/high_max": 0.004113701230380684, "clip_ratio/high_mean": 0.004113701230380684, "clip_ratio/low_mean": 0.004265533934812993, "clip_ratio/low_min": 0.004265533934812993, "clip_ratio/region_mean": 0.008379235165193677, "completions/clipped_ratio": 0.0, "completions/max_length": 2548.0, "completions/max_terminated_length": 2548.0, "completions/mean_length": 2327.3125, "completions/mean_terminated_length": 2327.3125, "completions/min_length": 1503.0, "completions/min_terminated_length": 1503.0, "entropy": 0.03402553265914321, "epoch": 3.532000141280005e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004015955608338118, "kl": 0.015500005800276995, "learning_rate": 7.462979996036847e-06, "loss": 0.0002, "num_tokens": 25573212.0, "reward": 5.424598217010498, "reward_std": 36.25648498535156, "rewards/rollout_reward_func/mean": 5.424598217010498, "rewards/rollout_reward_func/std": 36.25648880004883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.53125, "sampling/sampling_logp_difference/mean": 0.2674099802970886, "step": 883, "step_time": 33.36741296898981 }, { "clip_ratio/high_max": 0.0037066065124236047, "clip_ratio/high_mean": 0.0037066065124236047, "clip_ratio/low_mean": 0.004308705742005259, "clip_ratio/low_min": 0.004308705742005259, "clip_ratio/region_mean": 0.008015312196221203, "completions/clipped_ratio": 0.0, "completions/max_length": 2569.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 2390.5625, "completions/mean_terminated_length": 2390.5625, "completions/min_length": 1685.0, "completions/min_terminated_length": 1685.0, "entropy": 0.02984078205190599, "epoch": 3.536000141440006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038845748640596867, "kl": 0.016103474888950586, "learning_rate": 7.462979996027484e-06, "loss": 0.0002, "num_tokens": 25624373.0, "reward": -0.7145771980285645, "reward_std": 24.550853729248047, "rewards/rollout_reward_func/mean": -0.7145771980285645, "rewards/rollout_reward_func/std": 24.550853729248047, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.796875, "sampling/sampling_logp_difference/mean": 0.2555539906024933, "step": 884, "step_time": 34.05536495101114 }, { "clip_ratio/high_max": 0.0036581786116585135, "clip_ratio/high_mean": 0.0036581786116585135, "clip_ratio/low_mean": 0.0038686886546202004, "clip_ratio/low_min": 0.0038686886546202004, "clip_ratio/region_mean": 0.007526867266278714, "completions/clipped_ratio": 0.0, "completions/max_length": 2592.0, "completions/max_terminated_length": 2592.0, "completions/mean_length": 2440.375, "completions/mean_terminated_length": 2440.375, "completions/min_length": 2312.0, "completions/min_terminated_length": 2312.0, "entropy": 0.0324649706017226, "epoch": 3.5400001416000055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033797200303524733, "kl": 0.015524468151852489, "learning_rate": 7.46297999601811e-06, "loss": 0.0002, "num_tokens": 25676339.0, "reward": -4.1697306632995605, "reward_std": 12.595572471618652, "rewards/rollout_reward_func/mean": -4.1697306632995605, "rewards/rollout_reward_func/std": 12.595573425292969, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.3671875, "sampling/sampling_logp_difference/mean": 0.25654205679893494, "step": 885, "step_time": 34.48383025599469 }, { "clip_ratio/high_max": 0.0029730555543210357, "clip_ratio/high_mean": 0.0029730555543210357, "clip_ratio/low_mean": 0.005135295446962118, "clip_ratio/low_min": 0.005135295446962118, "clip_ratio/region_mean": 0.008108350972179323, "completions/clipped_ratio": 0.0, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 2464.375, "completions/mean_terminated_length": 2464.375, "completions/min_length": 2311.0, "completions/min_terminated_length": 2311.0, "entropy": 0.029431114671751857, "epoch": 3.544000141760006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004491792060434818, "kl": 0.014830568572506309, "learning_rate": 7.462979996008724e-06, "loss": 0.0002, "num_tokens": 25728682.0, "reward": -3.9537758827209473, "reward_std": 9.265303611755371, "rewards/rollout_reward_func/mean": -3.9537758827209473, "rewards/rollout_reward_func/std": 9.265303611755371, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.875, "sampling/sampling_logp_difference/mean": 0.24834804236888885, "step": 886, "step_time": 34.42906009098806 }, { "clip_ratio/high_max": 0.0031670978059992194, "clip_ratio/high_mean": 0.0031670978059992194, "clip_ratio/low_mean": 0.0044040945940651, "clip_ratio/low_min": 0.0044040945940651, "clip_ratio/region_mean": 0.007571192400064319, "completions/clipped_ratio": 0.0, "completions/max_length": 2545.0, "completions/max_terminated_length": 2545.0, "completions/mean_length": 2399.125, "completions/mean_terminated_length": 2399.125, "completions/min_length": 2283.0, "completions/min_terminated_length": 2283.0, "entropy": 0.03020193101838231, "epoch": 3.5480001419200057e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004896555095911026, "kl": 0.015736177330836654, "learning_rate": 7.462979995999327e-06, "loss": 0.0002, "num_tokens": 25779951.0, "reward": -0.40014147758483887, "reward_std": 11.070444107055664, "rewards/rollout_reward_func/mean": -0.40014147758483887, "rewards/rollout_reward_func/std": 11.070444107055664, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.25, "sampling/sampling_logp_difference/mean": 0.25191137194633484, "step": 887, "step_time": 33.43072744397796 }, { "clip_ratio/high_max": 0.005826612759847194, "clip_ratio/high_mean": 0.005826612759847194, "clip_ratio/low_mean": 0.0023399051860906184, "clip_ratio/low_min": 0.0023399051860906184, "clip_ratio/region_mean": 0.008166517887730151, "completions/clipped_ratio": 0.0, "completions/max_length": 2536.0, "completions/max_terminated_length": 2536.0, "completions/mean_length": 2420.0, "completions/mean_terminated_length": 2420.0, "completions/min_length": 2279.0, "completions/min_terminated_length": 2279.0, "entropy": 0.030664470745250583, "epoch": 3.5520001420800054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011056951247155666, "kl": 0.01897361606825143, "learning_rate": 7.462979995989919e-06, "loss": 0.0002, "num_tokens": 25831573.0, "reward": -4.932060718536377, "reward_std": 6.4574360847473145, "rewards/rollout_reward_func/mean": -4.932060718536377, "rewards/rollout_reward_func/std": 6.4574360847473145, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.52735137939453, "sampling/sampling_logp_difference/mean": 0.24618931114673615, "step": 888, "step_time": 33.77734707701893 }, { "clip_ratio/high_max": 0.005334513320121914, "clip_ratio/high_mean": 0.005334513320121914, "clip_ratio/low_mean": 0.002738884504651651, "clip_ratio/low_min": 0.002738884504651651, "clip_ratio/region_mean": 0.008073397970292717, "completions/clipped_ratio": 0.0, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 2425.5625, "completions/mean_terminated_length": 2425.5625, "completions/min_length": 2238.0, "completions/min_terminated_length": 2238.0, "entropy": 0.02966670272871852, "epoch": 3.556000142240006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00304230279289186, "kl": 0.014603638555854559, "learning_rate": 7.462979995980501e-06, "loss": 0.0002, "num_tokens": 25883289.0, "reward": -3.5888512134552, "reward_std": 11.613286018371582, "rewards/rollout_reward_func/mean": -3.5888512134552, "rewards/rollout_reward_func/std": 11.613286018371582, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.671875, "sampling/sampling_logp_difference/mean": 0.25079554319381714, "step": 889, "step_time": 34.80082576000132 }, { "clip_ratio/high_max": 0.0036423086712602526, "clip_ratio/high_mean": 0.0036423086712602526, "clip_ratio/low_mean": 0.004195129731670022, "clip_ratio/low_min": 0.004195129731670022, "clip_ratio/region_mean": 0.007837438373826444, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 2245.1875, "completions/mean_terminated_length": 2245.1875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.03208708227612078, "epoch": 3.5600001424000056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0042811729945242405, "kl": 0.016724353656172752, "learning_rate": 7.462979995971071e-06, "loss": 0.0002, "num_tokens": 25932096.0, "reward": 16.515600204467773, "reward_std": 41.35854721069336, "rewards/rollout_reward_func/mean": 16.515600204467773, "rewards/rollout_reward_func/std": 41.358551025390625, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.1875, "sampling/sampling_logp_difference/mean": 0.2688245475292206, "step": 890, "step_time": 33.03658306901343 }, { "clip_ratio/high_max": 0.0041095560300163925, "clip_ratio/high_mean": 0.0041095560300163925, "clip_ratio/low_mean": 0.003381477086804807, "clip_ratio/low_min": 0.003381477086804807, "clip_ratio/region_mean": 0.0074910331168212, "completions/clipped_ratio": 0.0, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 2429.375, "completions/mean_terminated_length": 2429.375, "completions/min_length": 2302.0, "completions/min_terminated_length": 2302.0, "entropy": 0.031460489612072706, "epoch": 3.564000142560006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008842600509524345, "kl": 0.015722267096862197, "learning_rate": 7.4629799959616305e-06, "loss": 0.0002, "num_tokens": 25983879.0, "reward": -5.1883649826049805, "reward_std": 8.038626670837402, "rewards/rollout_reward_func/mean": -5.1883649826049805, "rewards/rollout_reward_func/std": 8.038626670837402, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.03125, "sampling/sampling_logp_difference/mean": 0.252868115901947, "step": 891, "step_time": 34.00330153097457 }, { "clip_ratio/high_max": 0.004527033626800403, "clip_ratio/high_mean": 0.004527033626800403, "clip_ratio/low_mean": 0.0029874772881157696, "clip_ratio/low_min": 0.0029874772881157696, "clip_ratio/region_mean": 0.007514510885812342, "completions/clipped_ratio": 0.0, "completions/max_length": 2606.0, "completions/max_terminated_length": 2606.0, "completions/mean_length": 2470.875, "completions/mean_terminated_length": 2470.875, "completions/min_length": 2328.0, "completions/min_terminated_length": 2328.0, "entropy": 0.030772383091971278, "epoch": 3.568000142720006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024915265385061502, "kl": 0.012395283323712647, "learning_rate": 7.462979995952177e-06, "loss": 0.0001, "num_tokens": 26036337.0, "reward": -1.8688678741455078, "reward_std": 10.293466567993164, "rewards/rollout_reward_func/mean": -1.8688678741455078, "rewards/rollout_reward_func/std": 10.29346752166748, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.5625, "sampling/sampling_logp_difference/mean": 0.2470078468322754, "step": 892, "step_time": 34.399205513997 }, { "clip_ratio/high_max": 0.0029077691142447293, "clip_ratio/high_mean": 0.0029077691142447293, "clip_ratio/low_mean": 0.005176927428692579, "clip_ratio/low_min": 0.005176927428692579, "clip_ratio/region_mean": 0.008084696484729648, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 2441.625, "completions/mean_terminated_length": 2441.625, "completions/min_length": 2299.0, "completions/min_terminated_length": 2299.0, "entropy": 0.030874603427946568, "epoch": 3.5720001428800055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003593252506107092, "kl": 0.01472178497351706, "learning_rate": 7.4629799959427145e-06, "loss": 0.0002, "num_tokens": 26088310.0, "reward": -2.392216920852661, "reward_std": 12.749363899230957, "rewards/rollout_reward_func/mean": -2.392216920852661, "rewards/rollout_reward_func/std": 12.749364852905273, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.734378814697266, "sampling/sampling_logp_difference/mean": 0.25630712509155273, "step": 893, "step_time": 34.33385469499626 }, { "clip_ratio/high_max": 0.0019836015562759712, "clip_ratio/high_mean": 0.0019836015562759712, "clip_ratio/low_mean": 0.005536693963222206, "clip_ratio/low_min": 0.005536693963222206, "clip_ratio/region_mean": 0.0075202956213615835, "completions/clipped_ratio": 0.0, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 2379.75, "completions/mean_terminated_length": 2379.75, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "entropy": 0.030307927168905735, "epoch": 3.576000143040006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005044447723776102, "kl": 0.014421052299439907, "learning_rate": 7.4629799959332405e-06, "loss": 0.0002, "num_tokens": 26139321.0, "reward": 4.692266464233398, "reward_std": 23.337158203125, "rewards/rollout_reward_func/mean": 4.692266464233398, "rewards/rollout_reward_func/std": 23.337158203125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.59375, "sampling/sampling_logp_difference/mean": 0.24385470151901245, "step": 894, "step_time": 33.9968174140231 }, { "clip_ratio/high_max": 0.0028057548333890736, "clip_ratio/high_mean": 0.0028057548333890736, "clip_ratio/low_mean": 0.00509362670709379, "clip_ratio/low_min": 0.00509362670709379, "clip_ratio/region_mean": 0.007899381569586694, "completions/clipped_ratio": 0.0, "completions/max_length": 2600.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 2264.375, "completions/mean_terminated_length": 2264.375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.03149219835177064, "epoch": 3.5800001432000056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00669182138517499, "kl": 0.014688301831483841, "learning_rate": 7.462979995923755e-06, "loss": 0.0002, "num_tokens": 26188478.0, "reward": 10.32824420928955, "reward_std": 37.84201431274414, "rewards/rollout_reward_func/mean": 10.32824420928955, "rewards/rollout_reward_func/std": 37.84201431274414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.78125, "sampling/sampling_logp_difference/mean": 0.25330761075019836, "step": 895, "step_time": 33.73417423100909 }, { "clip_ratio/high_max": 0.004224779142532498, "clip_ratio/high_mean": 0.004224779142532498, "clip_ratio/low_mean": 0.004275778774172068, "clip_ratio/low_min": 0.004275778774172068, "clip_ratio/region_mean": 0.008500558033119887, "completions/clipped_ratio": 0.0, "completions/max_length": 2570.0, "completions/max_terminated_length": 2570.0, "completions/mean_length": 2403.1875, "completions/mean_terminated_length": 2403.1875, "completions/min_length": 2291.0, "completions/min_terminated_length": 2291.0, "entropy": 0.0308992771897465, "epoch": 3.584000143360006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004581250716000795, "kl": 0.016314062289893627, "learning_rate": 7.462979995914259e-06, "loss": 0.0002, "num_tokens": 26239818.0, "reward": 0.2876085042953491, "reward_std": 11.60338306427002, "rewards/rollout_reward_func/mean": 0.2876085042953491, "rewards/rollout_reward_func/std": 11.60338306427002, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.125, "sampling/sampling_logp_difference/mean": 0.25707992911338806, "step": 896, "step_time": 34.447227227996336 }, { "clip_ratio/high_max": 0.004252567887306213, "clip_ratio/high_mean": 0.004252567887306213, "clip_ratio/low_mean": 0.003434939935686998, "clip_ratio/low_min": 0.003434939935686998, "clip_ratio/region_mean": 0.007687507837545127, "completions/clipped_ratio": 0.0, "completions/max_length": 2559.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 2456.375, "completions/mean_terminated_length": 2456.375, "completions/min_length": 2319.0, "completions/min_terminated_length": 2319.0, "entropy": 0.029499916592612863, "epoch": 3.588000143520006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003861779347062111, "kl": 0.013524907873943448, "learning_rate": 7.462979995904751e-06, "loss": 0.0002, "num_tokens": 26292039.0, "reward": -5.078635215759277, "reward_std": 9.89518928527832, "rewards/rollout_reward_func/mean": -5.078635215759277, "rewards/rollout_reward_func/std": 9.895191192626953, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 59.84375, "sampling/sampling_logp_difference/mean": 0.24763642251491547, "step": 897, "step_time": 33.72899505299574 }, { "clip_ratio/high_max": 0.004292988043744117, "clip_ratio/high_mean": 0.004292988043744117, "clip_ratio/low_mean": 0.004203648277325556, "clip_ratio/low_min": 0.004203648277325556, "clip_ratio/region_mean": 0.008496636291965842, "completions/clipped_ratio": 0.0, "completions/max_length": 2586.0, "completions/max_terminated_length": 2586.0, "completions/mean_length": 2309.75, "completions/mean_terminated_length": 2309.75, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.029844286385923624, "epoch": 3.5920001436800055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029559347312897444, "kl": 0.01642241154331714, "learning_rate": 7.462979995895234e-06, "loss": 0.0002, "num_tokens": 26341911.0, "reward": -0.6154816150665283, "reward_std": 21.053180694580078, "rewards/rollout_reward_func/mean": -0.6154816150665283, "rewards/rollout_reward_func/std": 21.053180694580078, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.234378814697266, "sampling/sampling_logp_difference/mean": 0.25419318675994873, "step": 898, "step_time": 33.58002213500731 }, { "clip_ratio/high_max": 0.004402356717037037, "clip_ratio/high_mean": 0.004402356717037037, "clip_ratio/low_mean": 0.003569204796804115, "clip_ratio/low_min": 0.003569204796804115, "clip_ratio/region_mean": 0.007971561572048813, "completions/clipped_ratio": 0.0, "completions/max_length": 2533.0, "completions/max_terminated_length": 2533.0, "completions/mean_length": 2408.5625, "completions/mean_terminated_length": 2408.5625, "completions/min_length": 2311.0, "completions/min_terminated_length": 2311.0, "entropy": 0.030281685991212726, "epoch": 3.596000143840006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025562394876033068, "kl": 0.014940871973522007, "learning_rate": 7.462979995885703e-06, "loss": 0.0002, "num_tokens": 26393328.0, "reward": -2.4291493892669678, "reward_std": 16.434812545776367, "rewards/rollout_reward_func/mean": -2.4291493892669678, "rewards/rollout_reward_func/std": 16.434810638427734, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.578125, "sampling/sampling_logp_difference/mean": 0.2527885437011719, "step": 899, "step_time": 33.678008404021966 }, { "clip_ratio/high_max": 0.003612152810092084, "clip_ratio/high_mean": 0.003612152810092084, "clip_ratio/low_mean": 0.004803364019608125, "clip_ratio/low_min": 0.004803364019608125, "clip_ratio/region_mean": 0.008415516756940633, "completions/clipped_ratio": 0.0, "completions/max_length": 2602.0, "completions/max_terminated_length": 2602.0, "completions/mean_length": 2303.8125, "completions/mean_terminated_length": 2303.8125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.031095201149582863, "epoch": 3.600000144000006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007575422525405884, "kl": 0.01700862986035645, "learning_rate": 7.462979995876164e-06, "loss": 0.0002, "num_tokens": 26443099.0, "reward": 3.833515167236328, "reward_std": 26.46378517150879, "rewards/rollout_reward_func/mean": 3.833515167236328, "rewards/rollout_reward_func/std": 26.463787078857422, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.5273551940918, "sampling/sampling_logp_difference/mean": 0.25232234597206116, "step": 900, "step_time": 33.75145932599844 }, { "clip_ratio/high_max": 0.0054518444812856615, "clip_ratio/high_mean": 0.0054518444812856615, "clip_ratio/low_mean": 0.002709536493057385, "clip_ratio/low_min": 0.002709536493057385, "clip_ratio/region_mean": 0.008161380887031555, "completions/clipped_ratio": 0.0, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 2348.625, "completions/mean_terminated_length": 2348.625, "completions/min_length": 1571.0, "completions/min_terminated_length": 1571.0, "entropy": 0.032074387883767486, "epoch": 3.6040001441600054e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005896084941923618, "kl": 0.016215358045883477, "learning_rate": 7.462979995866613e-06, "loss": 0.0002, "num_tokens": 26493560.0, "reward": 3.7652149200439453, "reward_std": 34.523990631103516, "rewards/rollout_reward_func/mean": 3.7652149200439453, "rewards/rollout_reward_func/std": 34.52399444580078, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.968875885009766, "sampling/sampling_logp_difference/mean": 0.2547803521156311, "step": 901, "step_time": 33.495650522003416 }, { "clip_ratio/high_max": 0.004454433830687776, "clip_ratio/high_mean": 0.004454433830687776, "clip_ratio/low_mean": 0.003383150731679052, "clip_ratio/low_min": 0.003383150731679052, "clip_ratio/region_mean": 0.007837584416847676, "completions/clipped_ratio": 0.0, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 2414.0625, "completions/mean_terminated_length": 2414.0625, "completions/min_length": 2112.0, "completions/min_terminated_length": 2112.0, "entropy": 0.03038722090423107, "epoch": 3.608000144320006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003074022475630045, "kl": 0.014145763823762536, "learning_rate": 7.46297999585705e-06, "loss": 0.0002, "num_tokens": 26545102.0, "reward": 7.1350250244140625, "reward_std": 33.54043960571289, "rewards/rollout_reward_func/mean": 7.1350250244140625, "rewards/rollout_reward_func/std": 33.540443420410156, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.625, "sampling/sampling_logp_difference/mean": 0.26026034355163574, "step": 902, "step_time": 34.447278457999346 }, { "clip_ratio/high_max": 0.0026318003656342626, "clip_ratio/high_mean": 0.0026318003656342626, "clip_ratio/low_mean": 0.0047022846702020615, "clip_ratio/low_min": 0.0047022846702020615, "clip_ratio/region_mean": 0.007334085006732494, "completions/clipped_ratio": 0.0, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 2497.1875, "completions/mean_terminated_length": 2497.1875, "completions/min_length": 2419.0, "completions/min_terminated_length": 2419.0, "entropy": 0.030074414098635316, "epoch": 3.6120001444800056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004463964607566595, "kl": 0.014151728362776339, "learning_rate": 7.462979995847477e-06, "loss": 0.0002, "num_tokens": 26597975.0, "reward": -1.0691726207733154, "reward_std": 12.77005386352539, "rewards/rollout_reward_func/mean": -1.0691726207733154, "rewards/rollout_reward_func/std": 12.77005386352539, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.10548782348633, "sampling/sampling_logp_difference/mean": 0.24466682970523834, "step": 903, "step_time": 34.57330868000281 }, { "clip_ratio/high_max": 0.002487271325662732, "clip_ratio/high_mean": 0.002487271325662732, "clip_ratio/low_mean": 0.005881516408408061, "clip_ratio/low_min": 0.005881516408408061, "clip_ratio/region_mean": 0.008368787646759301, "completions/clipped_ratio": 0.0, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 2241.0625, "completions/mean_terminated_length": 2241.0625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.0313652204349637, "epoch": 3.616000144640006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009263883344829082, "kl": 0.017587301903404295, "learning_rate": 7.462979995837892e-06, "loss": 0.0002, "num_tokens": 26646711.0, "reward": 4.697998046875, "reward_std": 30.794475555419922, "rewards/rollout_reward_func/mean": 4.697998046875, "rewards/rollout_reward_func/std": 30.794477462768555, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.0078125, "sampling/sampling_logp_difference/mean": 0.25485146045684814, "step": 904, "step_time": 32.779766252977424 }, { "clip_ratio/high_max": 0.0024371712934225798, "clip_ratio/high_mean": 0.0024371712934225798, "clip_ratio/low_mean": 0.00511493647354655, "clip_ratio/low_min": 0.00511493647354655, "clip_ratio/region_mean": 0.007552107737865299, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 2388.9375, "completions/mean_terminated_length": 2388.9375, "completions/min_length": 2288.0, "completions/min_terminated_length": 2288.0, "entropy": 0.03162275557406247, "epoch": 3.620000144800006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00450235977768898, "kl": 0.014501720783300698, "learning_rate": 7.462979995828296e-06, "loss": 0.0002, "num_tokens": 26697824.0, "reward": -3.3291475772857666, "reward_std": 12.281828880310059, "rewards/rollout_reward_func/mean": -3.3291475772857666, "rewards/rollout_reward_func/std": 12.281829833984375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.59375, "sampling/sampling_logp_difference/mean": 0.2502705752849579, "step": 905, "step_time": 34.002082977996906 }, { "clip_ratio/high_max": 0.002823318383889273, "clip_ratio/high_mean": 0.002823318383889273, "clip_ratio/low_mean": 0.00434564525494352, "clip_ratio/low_min": 0.00434564525494352, "clip_ratio/region_mean": 0.007168963609728962, "completions/clipped_ratio": 0.0, "completions/max_length": 2565.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 2454.375, "completions/mean_terminated_length": 2454.375, "completions/min_length": 2392.0, "completions/min_terminated_length": 2392.0, "entropy": 0.031200740253552794, "epoch": 3.6240001449600055e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01515157800167799, "kl": 0.0158424258697778, "learning_rate": 7.4629799958186894e-06, "loss": 0.0002, "num_tokens": 26750009.0, "reward": -4.107669353485107, "reward_std": 9.616460800170898, "rewards/rollout_reward_func/mean": -4.107669353485107, "rewards/rollout_reward_func/std": 9.616461753845215, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.11328125, "sampling/sampling_logp_difference/mean": 0.2443024069070816, "step": 906, "step_time": 34.37816715001827 }, { "clip_ratio/high_max": 0.004578453488647938, "clip_ratio/high_mean": 0.004578453488647938, "clip_ratio/low_mean": 0.00372472865274176, "clip_ratio/low_min": 0.00372472865274176, "clip_ratio/region_mean": 0.008303182199597359, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 2449.5, "completions/mean_terminated_length": 2449.5, "completions/min_length": 2314.0, "completions/min_terminated_length": 2314.0, "entropy": 0.030089016538113356, "epoch": 3.628000145120006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007105086464434862, "kl": 0.015255560749210417, "learning_rate": 7.462979995809071e-06, "loss": 0.0002, "num_tokens": 26802128.0, "reward": -1.5074870586395264, "reward_std": 15.568296432495117, "rewards/rollout_reward_func/mean": -1.5074870586395264, "rewards/rollout_reward_func/std": 15.568296432495117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.84375, "sampling/sampling_logp_difference/mean": 0.24553944170475006, "step": 907, "step_time": 34.157126178994076 }, { "clip_ratio/high_max": 0.0033911501232068986, "clip_ratio/high_mean": 0.0033911501232068986, "clip_ratio/low_mean": 0.00532883673440665, "clip_ratio/low_min": 0.00532883673440665, "clip_ratio/region_mean": 0.008719986828509718, "completions/clipped_ratio": 0.0, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 2263.0625, "completions/mean_terminated_length": 2263.0625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.03076833044178784, "epoch": 3.632000145280006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024762554094195366, "kl": 0.01466081920079887, "learning_rate": 7.462979995799442e-06, "loss": 0.0002, "num_tokens": 26851215.0, "reward": 3.37099289894104, "reward_std": 26.55135726928711, "rewards/rollout_reward_func/mean": 3.37099289894104, "rewards/rollout_reward_func/std": 26.551361083984375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.859378814697266, "sampling/sampling_logp_difference/mean": 0.255733847618103, "step": 908, "step_time": 33.197783417999744 }, { "clip_ratio/high_max": 0.0034459960879758, "clip_ratio/high_mean": 0.0034459960879758, "clip_ratio/low_mean": 0.004044940636958927, "clip_ratio/low_min": 0.004044940636958927, "clip_ratio/region_mean": 0.007490936841350049, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 2494.0625, "completions/mean_terminated_length": 2494.0625, "completions/min_length": 2406.0, "completions/min_terminated_length": 2406.0, "entropy": 0.02968406956642866, "epoch": 3.636000145440006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00624583987519145, "kl": 0.01415251987054944, "learning_rate": 7.462979995789802e-06, "loss": 0.0002, "num_tokens": 26904069.0, "reward": -3.9663679599761963, "reward_std": 8.146526336669922, "rewards/rollout_reward_func/mean": -3.9663679599761963, "rewards/rollout_reward_func/std": 8.146527290344238, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.3125, "sampling/sampling_logp_difference/mean": 0.24437671899795532, "step": 909, "step_time": 34.369661510019796 }, { "clip_ratio/high_max": 0.0035259145661257207, "clip_ratio/high_mean": 0.0035259145661257207, "clip_ratio/low_mean": 0.0034154485911130905, "clip_ratio/low_min": 0.0034154485911130905, "clip_ratio/region_mean": 0.006941363215446472, "completions/clipped_ratio": 0.0, "completions/max_length": 2592.0, "completions/max_terminated_length": 2592.0, "completions/mean_length": 2458.8125, "completions/mean_terminated_length": 2458.8125, "completions/min_length": 2310.0, "completions/min_terminated_length": 2310.0, "entropy": 0.03007325460202992, "epoch": 3.640000145600006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.1551528126001358, "kl": 0.02488237083889544, "learning_rate": 7.462979995780152e-06, "loss": 0.0003, "num_tokens": 26956318.0, "reward": -1.6152410507202148, "reward_std": 8.938907623291016, "rewards/rollout_reward_func/mean": -1.6152410507202148, "rewards/rollout_reward_func/std": 8.938908576965332, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.011722564697266, "sampling/sampling_logp_difference/mean": 0.25293704867362976, "step": 910, "step_time": 34.4071796679782 }, { "clip_ratio/high_max": 0.0035940175293944776, "clip_ratio/high_mean": 0.0035940175293944776, "clip_ratio/low_mean": 0.004054041957715526, "clip_ratio/low_min": 0.004054041957715526, "clip_ratio/region_mean": 0.007648059574421495, "completions/clipped_ratio": 0.0, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 2433.1875, "completions/mean_terminated_length": 2433.1875, "completions/min_length": 2266.0, "completions/min_terminated_length": 2266.0, "entropy": 0.031410258961841464, "epoch": 3.6440001457600056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034950347617268562, "kl": 0.014148782938718796, "learning_rate": 7.46297999577049e-06, "loss": 0.0002, "num_tokens": 27008179.0, "reward": -2.2960331439971924, "reward_std": 13.52954387664795, "rewards/rollout_reward_func/mean": -2.2960331439971924, "rewards/rollout_reward_func/std": 13.529542922973633, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.0625, "sampling/sampling_logp_difference/mean": 0.25785934925079346, "step": 911, "step_time": 34.367592890994274 }, { "clip_ratio/high_max": 0.004768425191286951, "clip_ratio/high_mean": 0.004768425191286951, "clip_ratio/low_mean": 0.003912269341526553, "clip_ratio/low_min": 0.003912269341526553, "clip_ratio/region_mean": 0.008680694445502013, "completions/clipped_ratio": 0.0, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 2452.875, "completions/mean_terminated_length": 2452.875, "completions/min_length": 2345.0, "completions/min_terminated_length": 2345.0, "entropy": 0.029614276019856334, "epoch": 3.648000145920006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004338265396654606, "kl": 0.015332489041611552, "learning_rate": 7.462979995760818e-06, "loss": 0.0002, "num_tokens": 27060350.0, "reward": -4.081684112548828, "reward_std": 11.108400344848633, "rewards/rollout_reward_func/mean": -4.081684112548828, "rewards/rollout_reward_func/std": 11.10840129852295, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.9375, "sampling/sampling_logp_difference/mean": 0.2500767409801483, "step": 912, "step_time": 34.18431113898987 }, { "clip_ratio/high_max": 0.0018337846267968416, "clip_ratio/high_mean": 0.0018337846267968416, "clip_ratio/low_mean": 0.005076239904155955, "clip_ratio/low_min": 0.005076239904155955, "clip_ratio/region_mean": 0.006910024501848966, "completions/clipped_ratio": 0.0, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 2468.8125, "completions/mean_terminated_length": 2468.8125, "completions/min_length": 2389.0, "completions/min_terminated_length": 2389.0, "entropy": 0.030563271371647716, "epoch": 3.652000146080006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0050320494920015335, "kl": 0.01204371266067028, "learning_rate": 7.4629799957511335e-06, "loss": 0.0001, "num_tokens": 27112772.0, "reward": -3.099294662475586, "reward_std": 12.108461380004883, "rewards/rollout_reward_func/mean": -3.099294662475586, "rewards/rollout_reward_func/std": 12.1084623336792, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.0234375, "sampling/sampling_logp_difference/mean": 0.24508823454380035, "step": 913, "step_time": 34.250080753001384 }, { "clip_ratio/high_max": 0.003914859116775915, "clip_ratio/high_mean": 0.003914859116775915, "clip_ratio/low_mean": 0.00408912607235834, "clip_ratio/low_min": 0.00408912607235834, "clip_ratio/region_mean": 0.008003985160030425, "completions/clipped_ratio": 0.0, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 2323.75, "completions/mean_terminated_length": 2323.75, "completions/min_length": 2084.0, "completions/min_terminated_length": 2084.0, "entropy": 0.033338037552312016, "epoch": 3.656000146240006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002380275633186102, "kl": 0.013020400772802532, "learning_rate": 7.4629799957414375e-06, "loss": 0.0001, "num_tokens": 27162812.0, "reward": 1.1697701215744019, "reward_std": 14.118247985839844, "rewards/rollout_reward_func/mean": 1.1697701215744019, "rewards/rollout_reward_func/std": 14.11824893951416, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.46875, "sampling/sampling_logp_difference/mean": 0.2734869420528412, "step": 914, "step_time": 33.779833651991794 }, { "clip_ratio/high_max": 0.003843002807116136, "clip_ratio/high_mean": 0.003843002807116136, "clip_ratio/low_mean": 0.0037260036333464086, "clip_ratio/low_min": 0.0037260036333464086, "clip_ratio/region_mean": 0.007569006411358714, "completions/clipped_ratio": 0.0, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 2453.8125, "completions/mean_terminated_length": 2453.8125, "completions/min_length": 2293.0, "completions/min_terminated_length": 2293.0, "entropy": 0.03126399638131261, "epoch": 3.660000146400006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.036377646028995514, "kl": 0.01725073577836156, "learning_rate": 7.462979995731732e-06, "loss": 0.0002, "num_tokens": 27214992.0, "reward": -5.530854225158691, "reward_std": 11.745182991027832, "rewards/rollout_reward_func/mean": -5.530854225158691, "rewards/rollout_reward_func/std": 11.745183944702148, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.53125, "sampling/sampling_logp_difference/mean": 0.25381389260292053, "step": 915, "step_time": 34.395035600988194 }, { "clip_ratio/high_max": 0.003902299460605718, "clip_ratio/high_mean": 0.003902299460605718, "clip_ratio/low_mean": 0.004614387813489884, "clip_ratio/low_min": 0.004614387813489884, "clip_ratio/region_mean": 0.008516687201336026, "completions/clipped_ratio": 0.0, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 2275.4375, "completions/mean_terminated_length": 2275.4375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 0.03128737001679838, "epoch": 3.6640001465600056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004870665725320578, "kl": 0.013799920096062124, "learning_rate": 7.462979995722015e-06, "loss": 0.0002, "num_tokens": 27264291.0, "reward": -2.8859007358551025, "reward_std": 27.461532592773438, "rewards/rollout_reward_func/mean": -2.8859007358551025, "rewards/rollout_reward_func/std": 27.46153450012207, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.75, "sampling/sampling_logp_difference/mean": 0.24911555647850037, "step": 916, "step_time": 33.163889397037565 }, { "clip_ratio/high_max": 0.003291192842880264, "clip_ratio/high_mean": 0.003291192842880264, "clip_ratio/low_mean": 0.004158027149969712, "clip_ratio/low_min": 0.004158027149969712, "clip_ratio/region_mean": 0.007449219934642315, "completions/clipped_ratio": 0.0, "completions/max_length": 2566.0, "completions/max_terminated_length": 2566.0, "completions/mean_length": 2288.6875, "completions/mean_terminated_length": 2288.6875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.031271074898540974, "epoch": 3.668000146720006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037579869385808706, "kl": 0.013200630201026797, "learning_rate": 7.462979995712287e-06, "loss": 0.0001, "num_tokens": 27313814.0, "reward": -1.0835306644439697, "reward_std": 21.2847843170166, "rewards/rollout_reward_func/mean": -1.0835306644439697, "rewards/rollout_reward_func/std": 21.284786224365234, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 61.0625, "sampling/sampling_logp_difference/mean": 0.2482854425907135, "step": 917, "step_time": 33.58120823701029 }, { "clip_ratio/high_max": 0.003554695751518011, "clip_ratio/high_mean": 0.003554695751518011, "clip_ratio/low_mean": 0.003674548614071682, "clip_ratio/low_min": 0.003674548614071682, "clip_ratio/region_mean": 0.007229244278278202, "completions/clipped_ratio": 0.0, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 2431.3125, "completions/mean_terminated_length": 2431.3125, "completions/min_length": 2331.0, "completions/min_terminated_length": 2331.0, "entropy": 0.03053121385164559, "epoch": 3.672000146880006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004467049613595009, "kl": 0.014285293989814818, "learning_rate": 7.462979995702547e-06, "loss": 0.0002, "num_tokens": 27365617.0, "reward": -3.518479824066162, "reward_std": 9.826892852783203, "rewards/rollout_reward_func/mean": -3.518479824066162, "rewards/rollout_reward_func/std": 9.826891899108887, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.3125, "sampling/sampling_logp_difference/mean": 0.2535712718963623, "step": 918, "step_time": 34.42358642601175 }, { "clip_ratio/high_max": 0.0026313107227906585, "clip_ratio/high_mean": 0.0026313107227906585, "clip_ratio/low_mean": 0.005088463134597987, "clip_ratio/low_min": 0.005088463134597987, "clip_ratio/region_mean": 0.007719773740973324, "completions/clipped_ratio": 0.0, "completions/max_length": 2592.0, "completions/max_terminated_length": 2592.0, "completions/mean_length": 2274.3125, "completions/mean_terminated_length": 2274.3125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.03137285588309169, "epoch": 3.676000147040006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004403363447636366, "kl": 0.015335796633735299, "learning_rate": 7.462979995692797e-06, "loss": 0.0002, "num_tokens": 27414898.0, "reward": 7.542398452758789, "reward_std": 27.30243682861328, "rewards/rollout_reward_func/mean": 7.542398452758789, "rewards/rollout_reward_func/std": 27.302440643310547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.90625, "sampling/sampling_logp_difference/mean": 0.26129773259162903, "step": 919, "step_time": 34.0262064270064 }, { "clip_ratio/high_max": 0.0034499509492889047, "clip_ratio/high_mean": 0.0034499509492889047, "clip_ratio/low_mean": 0.004846106952754781, "clip_ratio/low_min": 0.004846106952754781, "clip_ratio/region_mean": 0.008296057872939855, "completions/clipped_ratio": 0.0, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 2308.1875, "completions/mean_terminated_length": 2308.1875, "completions/min_length": 1283.0, "completions/min_terminated_length": 1283.0, "entropy": 0.032018003752455115, "epoch": 3.680000147200006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004334242083132267, "kl": 0.015516105340793729, "learning_rate": 7.462979995683036e-06, "loss": 0.0002, "num_tokens": 27464720.0, "reward": -2.155129909515381, "reward_std": 27.261354446411133, "rewards/rollout_reward_func/mean": -2.155129909515381, "rewards/rollout_reward_func/std": 27.261350631713867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.5, "sampling/sampling_logp_difference/mean": 0.2586005628108978, "step": 920, "step_time": 33.54725215300277 }, { "clip_ratio/high_max": 0.0043088471502414905, "clip_ratio/high_mean": 0.0043088471502414905, "clip_ratio/low_mean": 0.003557219111826271, "clip_ratio/low_min": 0.003557219111826271, "clip_ratio/region_mean": 0.007866066240239888, "completions/clipped_ratio": 0.0, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 2261.9375, "completions/mean_terminated_length": 2261.9375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.03153191599994898, "epoch": 3.684000147360006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027248940896242857, "kl": 0.015404654550366104, "learning_rate": 7.462979995673263e-06, "loss": 0.0002, "num_tokens": 27513783.0, "reward": 2.120358943939209, "reward_std": 29.011964797973633, "rewards/rollout_reward_func/mean": 2.120358943939209, "rewards/rollout_reward_func/std": 29.011962890625, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.03125, "sampling/sampling_logp_difference/mean": 0.25693315267562866, "step": 921, "step_time": 33.33882504198118 }, { "clip_ratio/high_max": 0.0041350201645400375, "clip_ratio/high_mean": 0.0041350201645400375, "clip_ratio/low_mean": 0.00375013129087165, "clip_ratio/low_min": 0.00375013129087165, "clip_ratio/region_mean": 0.007885151368100196, "completions/clipped_ratio": 0.0, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 2428.1875, "completions/mean_terminated_length": 2428.1875, "completions/min_length": 2318.0, "completions/min_terminated_length": 2318.0, "entropy": 0.03171693813055754, "epoch": 3.688000147520006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004032057244330645, "kl": 0.015070343855768442, "learning_rate": 7.46297999566348e-06, "loss": 0.0002, "num_tokens": 27565527.0, "reward": -4.884790420532227, "reward_std": 12.968335151672363, "rewards/rollout_reward_func/mean": -4.884790420532227, "rewards/rollout_reward_func/std": 12.96833610534668, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.40625, "sampling/sampling_logp_difference/mean": 0.2487853765487671, "step": 922, "step_time": 34.449104434999754 }, { "clip_ratio/high_max": 0.004066875495482236, "clip_ratio/high_mean": 0.004066875495482236, "clip_ratio/low_mean": 0.0044506717240437865, "clip_ratio/low_min": 0.0044506717240437865, "clip_ratio/region_mean": 0.008517547277733684, "completions/clipped_ratio": 0.0, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 2393.5625, "completions/mean_terminated_length": 2393.5625, "completions/min_length": 2278.0, "completions/min_terminated_length": 2278.0, "entropy": 0.029989304253831506, "epoch": 3.692000147680006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037378263659775257, "kl": 0.015377734089270234, "learning_rate": 7.462979995653685e-06, "loss": 0.0002, "num_tokens": 27616695.0, "reward": -2.560098171234131, "reward_std": 9.902878761291504, "rewards/rollout_reward_func/mean": -2.560098171234131, "rewards/rollout_reward_func/std": 9.902878761291504, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.578125, "sampling/sampling_logp_difference/mean": 0.2669619619846344, "step": 923, "step_time": 34.63225643000624 }, { "clip_ratio/high_max": 0.0023303628258872777, "clip_ratio/high_mean": 0.0023303628258872777, "clip_ratio/low_mean": 0.005730381060857326, "clip_ratio/low_min": 0.005730381060857326, "clip_ratio/region_mean": 0.008060743915848434, "completions/clipped_ratio": 0.0, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 2280.0625, "completions/mean_terminated_length": 2280.0625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.031417797319591045, "epoch": 3.6960001478400056e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.1243479773402214, "kl": 0.03135770501103252, "learning_rate": 7.46297999564388e-06, "loss": 0.0004, "num_tokens": 27666062.0, "reward": -1.5454401969909668, "reward_std": 19.531023025512695, "rewards/rollout_reward_func/mean": -1.5454401969909668, "rewards/rollout_reward_func/std": 19.531024932861328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.796875, "sampling/sampling_logp_difference/mean": 0.26558881998062134, "step": 924, "step_time": 33.22080015097163 }, { "clip_ratio/high_max": 0.004220190632622689, "clip_ratio/high_mean": 0.004220190632622689, "clip_ratio/low_mean": 0.00415484924451448, "clip_ratio/low_min": 0.00415484924451448, "clip_ratio/region_mean": 0.008375039906241, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 2438.3125, "completions/mean_terminated_length": 2438.3125, "completions/min_length": 2312.0, "completions/min_terminated_length": 2312.0, "entropy": 0.03181131719611585, "epoch": 3.700000148000006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025457025039941072, "kl": 0.012601040420122445, "learning_rate": 7.462979995634063e-06, "loss": 0.0001, "num_tokens": 27717989.0, "reward": 3.6405797004699707, "reward_std": 12.321174621582031, "rewards/rollout_reward_func/mean": 3.6405797004699707, "rewards/rollout_reward_func/std": 12.321174621582031, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.703125, "sampling/sampling_logp_difference/mean": 0.24663889408111572, "step": 925, "step_time": 33.78622796302079 }, { "clip_ratio/high_max": 0.004147223080508411, "clip_ratio/high_mean": 0.004147223080508411, "clip_ratio/low_mean": 0.003671068901894614, "clip_ratio/low_min": 0.003671068901894614, "clip_ratio/region_mean": 0.007818291895091534, "completions/clipped_ratio": 0.0, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 2517.8125, "completions/mean_terminated_length": 2517.8125, "completions/min_length": 2359.0, "completions/min_terminated_length": 2359.0, "entropy": 0.02811761130578816, "epoch": 3.704000148160006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034141158685088158, "kl": 0.01293194864410907, "learning_rate": 7.462979995624235e-06, "loss": 0.0002, "num_tokens": 27771228.0, "reward": -4.945171356201172, "reward_std": 8.614218711853027, "rewards/rollout_reward_func/mean": -4.945171356201172, "rewards/rollout_reward_func/std": 8.614218711853027, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.671875, "sampling/sampling_logp_difference/mean": 0.2459946572780609, "step": 926, "step_time": 34.559292913996615 }, { "clip_ratio/high_max": 0.0023313936253543943, "clip_ratio/high_mean": 0.0023313936253543943, "clip_ratio/low_mean": 0.005355662608053535, "clip_ratio/low_min": 0.005355662608053535, "clip_ratio/region_mean": 0.00768705626251176, "completions/clipped_ratio": 0.0, "completions/max_length": 2602.0, "completions/max_terminated_length": 2602.0, "completions/mean_length": 2279.625, "completions/mean_terminated_length": 2279.625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.03241957211866975, "epoch": 3.708000148320006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005813283380120993, "kl": 0.014827109524048865, "learning_rate": 7.462979995614397e-06, "loss": 0.0002, "num_tokens": 27820644.0, "reward": 19.084632873535156, "reward_std": 49.63894271850586, "rewards/rollout_reward_func/mean": 19.084632873535156, "rewards/rollout_reward_func/std": 49.638946533203125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.8828125, "sampling/sampling_logp_difference/mean": 0.25518596172332764, "step": 927, "step_time": 33.5186656639853 }, { "clip_ratio/high_max": 0.004347122798208147, "clip_ratio/high_mean": 0.004347122798208147, "clip_ratio/low_mean": 0.0031963105138856918, "clip_ratio/low_min": 0.0031963105138856918, "clip_ratio/region_mean": 0.007543433224782348, "completions/clipped_ratio": 0.0, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 2440.4375, "completions/mean_terminated_length": 2440.4375, "completions/min_length": 2310.0, "completions/min_terminated_length": 2310.0, "entropy": 0.03102425066754222, "epoch": 3.712000148480006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003760730614885688, "kl": 0.01350200385786593, "learning_rate": 7.462979995604547e-06, "loss": 0.0002, "num_tokens": 27872598.0, "reward": -3.0122017860412598, "reward_std": 10.606231689453125, "rewards/rollout_reward_func/mean": -3.0122017860412598, "rewards/rollout_reward_func/std": 10.606231689453125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.3125, "sampling/sampling_logp_difference/mean": 0.2511221170425415, "step": 928, "step_time": 33.80765382903337 }, { "clip_ratio/high_max": 0.0035371060366742313, "clip_ratio/high_mean": 0.0035371060366742313, "clip_ratio/low_mean": 0.003907463076757267, "clip_ratio/low_min": 0.003907463076757267, "clip_ratio/region_mean": 0.007444569084327668, "completions/clipped_ratio": 0.0, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 2429.3125, "completions/mean_terminated_length": 2429.3125, "completions/min_length": 2277.0, "completions/min_terminated_length": 2277.0, "entropy": 0.031834305031225085, "epoch": 3.716000148640006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026857545599341393, "kl": 0.0128998743603006, "learning_rate": 7.4629799955946865e-06, "loss": 0.0002, "num_tokens": 27924369.0, "reward": 1.166353702545166, "reward_std": 11.72590160369873, "rewards/rollout_reward_func/mean": 1.166353702545166, "rewards/rollout_reward_func/std": 11.725902557373047, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.5078125, "sampling/sampling_logp_difference/mean": 0.2542523145675659, "step": 929, "step_time": 34.34108992700931 }, { "clip_ratio/high_max": 0.003736439859494567, "clip_ratio/high_mean": 0.003736439859494567, "clip_ratio/low_mean": 0.004607674083672464, "clip_ratio/low_min": 0.004607674083672464, "clip_ratio/region_mean": 0.00834411394316703, "completions/clipped_ratio": 0.0, "completions/max_length": 2568.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 2239.8125, "completions/mean_terminated_length": 2239.8125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.031323542818427086, "epoch": 3.720000148800006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030518437270075083, "kl": 0.015381950535811484, "learning_rate": 7.462979995584814e-06, "loss": 0.0002, "num_tokens": 27973097.0, "reward": 1.1660566329956055, "reward_std": 24.77982521057129, "rewards/rollout_reward_func/mean": 1.1660566329956055, "rewards/rollout_reward_func/std": 24.77982521057129, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.125, "sampling/sampling_logp_difference/mean": 0.26459091901779175, "step": 930, "step_time": 33.6613453500031 }, { "clip_ratio/high_max": 0.003724062960827723, "clip_ratio/high_mean": 0.003724062960827723, "clip_ratio/low_mean": 0.003412866994040087, "clip_ratio/low_min": 0.003412866994040087, "clip_ratio/region_mean": 0.00713692995486781, "completions/clipped_ratio": 0.0, "completions/max_length": 2594.0, "completions/max_terminated_length": 2594.0, "completions/mean_length": 2450.375, "completions/mean_terminated_length": 2450.375, "completions/min_length": 2275.0, "completions/min_terminated_length": 2275.0, "entropy": 0.03031683200970292, "epoch": 3.724000148960006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033509063068777323, "kl": 0.014653791324235499, "learning_rate": 7.462979995574931e-06, "loss": 0.0002, "num_tokens": 28025202.0, "reward": -3.112334966659546, "reward_std": 9.655299186706543, "rewards/rollout_reward_func/mean": -3.112334966659546, "rewards/rollout_reward_func/std": 9.655299186706543, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.796875, "sampling/sampling_logp_difference/mean": 0.2502186894416809, "step": 931, "step_time": 34.43811254898901 }, { "clip_ratio/high_max": 0.0028676487563643605, "clip_ratio/high_mean": 0.0028676487563643605, "clip_ratio/low_mean": 0.0049895913689397275, "clip_ratio/low_min": 0.0049895913689397275, "clip_ratio/region_mean": 0.007857240154407918, "completions/clipped_ratio": 0.0, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 2421.8125, "completions/mean_terminated_length": 2421.8125, "completions/min_length": 2300.0, "completions/min_terminated_length": 2300.0, "entropy": 0.0313272497151047, "epoch": 3.728000149120006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.03421955183148384, "kl": 0.017786641139537096, "learning_rate": 7.462979995565037e-06, "loss": 0.0002, "num_tokens": 28076847.0, "reward": -1.0069642066955566, "reward_std": 10.756511688232422, "rewards/rollout_reward_func/mean": -1.0069642066955566, "rewards/rollout_reward_func/std": 10.756511688232422, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.0, "sampling/sampling_logp_difference/mean": 0.24370324611663818, "step": 932, "step_time": 33.74575532000745 }, { "clip_ratio/high_max": 0.004129926121095195, "clip_ratio/high_mean": 0.004129926121095195, "clip_ratio/low_mean": 0.003973119950387627, "clip_ratio/low_min": 0.003973119950387627, "clip_ratio/region_mean": 0.008103046042378992, "completions/clipped_ratio": 0.0, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 2409.5, "completions/mean_terminated_length": 2409.5, "completions/min_length": 2291.0, "completions/min_terminated_length": 2291.0, "entropy": 0.031044892501085997, "epoch": 3.732000149280006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026509463787078857, "kl": 0.013830556534230709, "learning_rate": 7.462979995555133e-06, "loss": 0.0002, "num_tokens": 28128276.0, "reward": -6.569438934326172, "reward_std": 9.400367736816406, "rewards/rollout_reward_func/mean": -6.569438934326172, "rewards/rollout_reward_func/std": 9.400367736816406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 61.8984375, "sampling/sampling_logp_difference/mean": 0.256250262260437, "step": 933, "step_time": 34.15517627199006 }, { "clip_ratio/high_max": 0.005037958937464282, "clip_ratio/high_mean": 0.005037958937464282, "clip_ratio/low_mean": 0.0033799909870140254, "clip_ratio/low_min": 0.0033799909870140254, "clip_ratio/region_mean": 0.008417949953582138, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 2450.5, "completions/mean_terminated_length": 2450.5, "completions/min_length": 2327.0, "completions/min_terminated_length": 2327.0, "entropy": 0.030809686286374927, "epoch": 3.736000149440006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02620209939777851, "kl": 0.021189481602050364, "learning_rate": 7.462979995545216e-06, "loss": 0.0003, "num_tokens": 28180400.0, "reward": -0.038964271545410156, "reward_std": 16.791353225708008, "rewards/rollout_reward_func/mean": -0.038964271545410156, "rewards/rollout_reward_func/std": 16.791353225708008, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.0, "sampling/sampling_logp_difference/mean": 0.24990947544574738, "step": 934, "step_time": 38.645121737004956 }, { "clip_ratio/high_max": 0.0036737805348820984, "clip_ratio/high_mean": 0.0036737805348820984, "clip_ratio/low_mean": 0.0035872394655598328, "clip_ratio/low_min": 0.0035872394655598328, "clip_ratio/region_mean": 0.007261020014993846, "completions/clipped_ratio": 0.0, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 2448.3125, "completions/mean_terminated_length": 2448.3125, "completions/min_length": 2292.0, "completions/min_terminated_length": 2292.0, "entropy": 0.03116030851379037, "epoch": 3.740000149600006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005458693951368332, "kl": 0.014516756753437221, "learning_rate": 7.462979995535289e-06, "loss": 0.0002, "num_tokens": 28232482.0, "reward": -3.272641658782959, "reward_std": 13.829086303710938, "rewards/rollout_reward_func/mean": -3.272641658782959, "rewards/rollout_reward_func/std": 13.829086303710938, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.40625, "sampling/sampling_logp_difference/mean": 0.2531263530254364, "step": 935, "step_time": 34.165102489016135 }, { "clip_ratio/high_max": 0.005124999181134626, "clip_ratio/high_mean": 0.005124999181134626, "clip_ratio/low_mean": 0.0030097228882368654, "clip_ratio/low_min": 0.0030097228882368654, "clip_ratio/region_mean": 0.00813472201116383, "completions/clipped_ratio": 0.0, "completions/max_length": 2575.0, "completions/max_terminated_length": 2575.0, "completions/mean_length": 2400.875, "completions/mean_terminated_length": 2400.875, "completions/min_length": 2275.0, "completions/min_terminated_length": 2275.0, "entropy": 0.03186303866095841, "epoch": 3.744000149760006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027210975531488657, "kl": 0.015096829854883254, "learning_rate": 7.462979995525351e-06, "loss": 0.0002, "num_tokens": 28283787.0, "reward": -4.507424354553223, "reward_std": 14.907532691955566, "rewards/rollout_reward_func/mean": -4.507424354553223, "rewards/rollout_reward_func/std": 14.907532691955566, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.421939849853516, "sampling/sampling_logp_difference/mean": 0.25668787956237793, "step": 936, "step_time": 34.26190139599203 }, { "clip_ratio/high_max": 0.0033055298117687926, "clip_ratio/high_mean": 0.0033055298117687926, "clip_ratio/low_mean": 0.005615035886876285, "clip_ratio/low_min": 0.005615035886876285, "clip_ratio/region_mean": 0.008920565596781671, "completions/clipped_ratio": 0.0, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 2288.9375, "completions/mean_terminated_length": 2288.9375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "entropy": 0.03055663756094873, "epoch": 3.748000149920006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004310943651944399, "kl": 0.01442855189088732, "learning_rate": 7.4629799955154025e-06, "loss": 0.0002, "num_tokens": 28333327.0, "reward": 3.215263843536377, "reward_std": 24.011091232299805, "rewards/rollout_reward_func/mean": 3.215263843536377, "rewards/rollout_reward_func/std": 24.011091232299805, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.6875, "sampling/sampling_logp_difference/mean": 0.24696476757526398, "step": 937, "step_time": 32.971967633013264 }, { "clip_ratio/high_max": 0.0055789947509765625, "clip_ratio/high_mean": 0.0055789947509765625, "clip_ratio/low_mean": 0.0028743969451170415, "clip_ratio/low_min": 0.0028743969451170415, "clip_ratio/region_mean": 0.008453391725197434, "completions/clipped_ratio": 0.0, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 2484.125, "completions/mean_terminated_length": 2484.125, "completions/min_length": 2324.0, "completions/min_terminated_length": 2324.0, "entropy": 0.0281898679677397, "epoch": 3.752000150080006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030364436097443104, "kl": 0.010752745380159467, "learning_rate": 7.462979995505441e-06, "loss": 0.0001, "num_tokens": 28386001.0, "reward": -8.377891540527344, "reward_std": 13.329728126525879, "rewards/rollout_reward_func/mean": -8.377891540527344, "rewards/rollout_reward_func/std": 13.329728126525879, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.09375, "sampling/sampling_logp_difference/mean": 0.24789048731327057, "step": 938, "step_time": 34.382674157008296 }, { "clip_ratio/high_max": 0.002892708813305944, "clip_ratio/high_mean": 0.002892708813305944, "clip_ratio/low_mean": 0.003775458550080657, "clip_ratio/low_min": 0.003775458550080657, "clip_ratio/region_mean": 0.006668167421594262, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 2402.4375, "completions/mean_terminated_length": 2402.4375, "completions/min_length": 2239.0, "completions/min_terminated_length": 2239.0, "entropy": 0.030656173126772046, "epoch": 3.756000150240006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007813388481736183, "kl": 0.014695187797769904, "learning_rate": 7.462979995495471e-06, "loss": 0.0002, "num_tokens": 28437350.0, "reward": 2.049506187438965, "reward_std": 11.069250106811523, "rewards/rollout_reward_func/mean": 2.049506187438965, "rewards/rollout_reward_func/std": 11.069250106811523, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.828125, "sampling/sampling_logp_difference/mean": 0.2579949200153351, "step": 939, "step_time": 33.47251467400929 }, { "clip_ratio/high_max": 0.004188219143543392, "clip_ratio/high_mean": 0.004188219143543392, "clip_ratio/low_mean": 0.003840511344606057, "clip_ratio/low_min": 0.003840511344606057, "clip_ratio/region_mean": 0.00802873051725328, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 2397.1875, "completions/mean_terminated_length": 2397.1875, "completions/min_length": 2318.0, "completions/min_terminated_length": 2318.0, "entropy": 0.03163479408249259, "epoch": 3.760000150400006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004548822529613972, "kl": 0.012999239610508084, "learning_rate": 7.462979995485488e-06, "loss": 0.0002, "num_tokens": 28488590.0, "reward": -3.5216760635375977, "reward_std": 14.285760879516602, "rewards/rollout_reward_func/mean": -3.5216760635375977, "rewards/rollout_reward_func/std": 14.285761833190918, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.15625, "sampling/sampling_logp_difference/mean": 0.2449018657207489, "step": 940, "step_time": 34.28824226101278 }, { "clip_ratio/high_max": 0.004606866394169629, "clip_ratio/high_mean": 0.004606866394169629, "clip_ratio/low_mean": 0.003419756278162822, "clip_ratio/low_min": 0.003419756278162822, "clip_ratio/region_mean": 0.008026622701436281, "completions/clipped_ratio": 0.0, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 2485.875, "completions/mean_terminated_length": 2485.875, "completions/min_length": 2224.0, "completions/min_terminated_length": 2224.0, "entropy": 0.02972618374042213, "epoch": 3.764000150560006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004281362518668175, "kl": 0.013363054720684886, "learning_rate": 7.462979995475494e-06, "loss": 0.0002, "num_tokens": 28541303.0, "reward": -0.2996327877044678, "reward_std": 20.443798065185547, "rewards/rollout_reward_func/mean": -0.2996327877044678, "rewards/rollout_reward_func/std": 20.443798065185547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 59.15625, "sampling/sampling_logp_difference/mean": 0.24835845828056335, "step": 941, "step_time": 34.26396947198373 }, { "clip_ratio/high_max": 0.0042825458804145455, "clip_ratio/high_mean": 0.0042825458804145455, "clip_ratio/low_mean": 0.0036902643623761833, "clip_ratio/low_min": 0.0036902643623761833, "clip_ratio/region_mean": 0.007972810126375407, "completions/clipped_ratio": 0.0, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 2406.6875, "completions/mean_terminated_length": 2406.6875, "completions/min_length": 2306.0, "completions/min_terminated_length": 2306.0, "entropy": 0.030117045855149627, "epoch": 3.768000150720006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027741629164665937, "kl": 0.011975224129855633, "learning_rate": 7.46297999546549e-06, "loss": 0.0001, "num_tokens": 28592705.0, "reward": -7.112144947052002, "reward_std": 12.586919784545898, "rewards/rollout_reward_func/mean": -7.112144947052002, "rewards/rollout_reward_func/std": 12.586919784545898, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.0, "sampling/sampling_logp_difference/mean": 0.2483857125043869, "step": 942, "step_time": 33.52698980100104 }, { "clip_ratio/high_max": 0.0030666892416775227, "clip_ratio/high_mean": 0.0030666892416775227, "clip_ratio/low_mean": 0.0054103536531329155, "clip_ratio/low_min": 0.0054103536531329155, "clip_ratio/region_mean": 0.00847704301122576, "completions/clipped_ratio": 0.0, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 2251.125, "completions/mean_terminated_length": 2251.125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.03213652758859098, "epoch": 3.772000150880006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035618001129478216, "kl": 0.012373664882034063, "learning_rate": 7.462979995455474e-06, "loss": 0.0001, "num_tokens": 28641609.0, "reward": 1.5788861513137817, "reward_std": 27.159616470336914, "rewards/rollout_reward_func/mean": 1.5788861513137817, "rewards/rollout_reward_func/std": 27.159618377685547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.609375, "sampling/sampling_logp_difference/mean": 0.2487299144268036, "step": 943, "step_time": 33.20558157598134 }, { "clip_ratio/high_max": 0.002971545036416501, "clip_ratio/high_mean": 0.002971545036416501, "clip_ratio/low_mean": 0.004433201189385727, "clip_ratio/low_min": 0.004433201189385727, "clip_ratio/region_mean": 0.007404746196698397, "completions/clipped_ratio": 0.0, "completions/max_length": 2558.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 2454.4375, "completions/mean_terminated_length": 2454.4375, "completions/min_length": 2140.0, "completions/min_terminated_length": 2140.0, "entropy": 0.03221535193733871, "epoch": 3.776000151040006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003905080258846283, "kl": 0.012835674453526735, "learning_rate": 7.462979995445448e-06, "loss": 0.0002, "num_tokens": 28693808.0, "reward": 7.46533727645874, "reward_std": 32.47327423095703, "rewards/rollout_reward_func/mean": 7.46533727645874, "rewards/rollout_reward_func/std": 32.47327423095703, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.65625, "sampling/sampling_logp_difference/mean": 0.259292870759964, "step": 944, "step_time": 33.48992067200015 }, { "clip_ratio/high_max": 0.005411853169789538, "clip_ratio/high_mean": 0.005411853169789538, "clip_ratio/low_mean": 0.0027062621957156807, "clip_ratio/low_min": 0.0027062621957156807, "clip_ratio/region_mean": 0.00811811548192054, "completions/clipped_ratio": 0.0, "completions/max_length": 2605.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 2433.375, "completions/mean_terminated_length": 2433.375, "completions/min_length": 2295.0, "completions/min_terminated_length": 2295.0, "entropy": 0.030824911780655384, "epoch": 3.780000151200006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02965238131582737, "kl": 0.01881077536381781, "learning_rate": 7.46297999543541e-06, "loss": 0.0002, "num_tokens": 28745654.0, "reward": 0.06010770797729492, "reward_std": 16.58462905883789, "rewards/rollout_reward_func/mean": 0.06010770797729492, "rewards/rollout_reward_func/std": 16.584630966186523, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.78125, "sampling/sampling_logp_difference/mean": 0.24955905973911285, "step": 945, "step_time": 34.29880420600239 }, { "clip_ratio/high_max": 0.0033343459072057158, "clip_ratio/high_mean": 0.0033343459072057158, "clip_ratio/low_mean": 0.004617176600731909, "clip_ratio/low_min": 0.004617176600731909, "clip_ratio/region_mean": 0.007951522595249116, "completions/clipped_ratio": 0.0, "completions/max_length": 2605.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 2454.0, "completions/mean_terminated_length": 2454.0, "completions/min_length": 2239.0, "completions/min_terminated_length": 2239.0, "entropy": 0.028366157319396734, "epoch": 3.784000151360006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003699258901178837, "kl": 0.012494505965150893, "learning_rate": 7.462979995425362e-06, "loss": 0.0001, "num_tokens": 28797831.0, "reward": -3.1273884773254395, "reward_std": 15.413614273071289, "rewards/rollout_reward_func/mean": -3.1273884773254395, "rewards/rollout_reward_func/std": 15.413614273071289, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.765625, "sampling/sampling_logp_difference/mean": 0.25068992376327515, "step": 946, "step_time": 34.278432373001124 }, { "clip_ratio/high_max": 0.0037035971181467175, "clip_ratio/high_mean": 0.0037035971181467175, "clip_ratio/low_mean": 0.004061868312419392, "clip_ratio/low_min": 0.004061868312419392, "clip_ratio/region_mean": 0.007765465532429516, "completions/clipped_ratio": 0.0, "completions/max_length": 2535.0, "completions/max_terminated_length": 2535.0, "completions/mean_length": 2473.875, "completions/mean_terminated_length": 2473.875, "completions/min_length": 2383.0, "completions/min_terminated_length": 2383.0, "entropy": 0.03090908145532012, "epoch": 3.788000151520006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007478701416403055, "kl": 0.012770455563440919, "learning_rate": 7.462979995415303e-06, "loss": 0.0002, "num_tokens": 28850337.0, "reward": -0.04736042022705078, "reward_std": 12.83162784576416, "rewards/rollout_reward_func/mean": -0.04736042022705078, "rewards/rollout_reward_func/std": 12.83162784576416, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.875, "sampling/sampling_logp_difference/mean": 0.25083982944488525, "step": 947, "step_time": 33.54683804200613 }, { "clip_ratio/high_max": 0.003034129651496187, "clip_ratio/high_mean": 0.003034129651496187, "clip_ratio/low_mean": 0.004729134816443548, "clip_ratio/low_min": 0.004729134816443548, "clip_ratio/region_mean": 0.0077632644679397345, "completions/clipped_ratio": 0.0, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 2401.125, "completions/mean_terminated_length": 2401.125, "completions/min_length": 2289.0, "completions/min_terminated_length": 2289.0, "entropy": 0.030805787770077586, "epoch": 3.792000151680006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00819613877683878, "kl": 0.016124141518957913, "learning_rate": 7.462979995405232e-06, "loss": 0.0002, "num_tokens": 28901635.0, "reward": -4.5006184577941895, "reward_std": 9.17221736907959, "rewards/rollout_reward_func/mean": -4.5006184577941895, "rewards/rollout_reward_func/std": 9.17221736907959, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.980499267578125, "sampling/sampling_logp_difference/mean": 0.2566927373409271, "step": 948, "step_time": 33.361452744022245 }, { "clip_ratio/high_max": 0.003106017305981368, "clip_ratio/high_mean": 0.003106017305981368, "clip_ratio/low_mean": 0.004402175021823496, "clip_ratio/low_min": 0.004402175021823496, "clip_ratio/region_mean": 0.0075081923278048635, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 2414.8125, "completions/mean_terminated_length": 2414.8125, "completions/min_length": 1849.0, "completions/min_terminated_length": 1849.0, "entropy": 0.032473906176164746, "epoch": 3.796000151840006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004290299490094185, "kl": 0.013719054870307446, "learning_rate": 7.462979995395151e-06, "loss": 0.0002, "num_tokens": 28953200.0, "reward": 5.150443077087402, "reward_std": 25.134075164794922, "rewards/rollout_reward_func/mean": 5.150443077087402, "rewards/rollout_reward_func/std": 25.134077072143555, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.34765625, "sampling/sampling_logp_difference/mean": 0.2577853798866272, "step": 949, "step_time": 33.79166821099352 }, { "clip_ratio/high_max": 0.004264684015652165, "clip_ratio/high_mean": 0.004264684015652165, "clip_ratio/low_mean": 0.0035156736557837576, "clip_ratio/low_min": 0.0035156736557837576, "clip_ratio/region_mean": 0.007780357671435922, "completions/clipped_ratio": 0.0, "completions/max_length": 2556.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 2295.6875, "completions/mean_terminated_length": 2295.6875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.032033616211265326, "epoch": 3.8000001520000064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004084316082298756, "kl": 0.013309777830727398, "learning_rate": 7.462979995385057e-06, "loss": 0.0002, "num_tokens": 29002827.0, "reward": 5.246494770050049, "reward_std": 24.973421096801758, "rewards/rollout_reward_func/mean": 5.246494770050049, "rewards/rollout_reward_func/std": 24.973419189453125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.25, "sampling/sampling_logp_difference/mean": 0.26017525792121887, "step": 950, "step_time": 32.76798381500703 }, { "clip_ratio/high_max": 0.00395563617348671, "clip_ratio/high_mean": 0.00395563617348671, "clip_ratio/low_mean": 0.004030834970762953, "clip_ratio/low_min": 0.004030834970762953, "clip_ratio/region_mean": 0.007986471231561154, "completions/clipped_ratio": 0.0, "completions/max_length": 2591.0, "completions/max_terminated_length": 2591.0, "completions/mean_length": 2377.8125, "completions/mean_terminated_length": 2377.8125, "completions/min_length": 1558.0, "completions/min_terminated_length": 1558.0, "entropy": 0.03131597931496799, "epoch": 3.804000152160006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013736075721681118, "kl": 0.015553482342511415, "learning_rate": 7.462979995374953e-06, "loss": 0.0002, "num_tokens": 29053794.0, "reward": 8.844197273254395, "reward_std": 36.43212890625, "rewards/rollout_reward_func/mean": 8.844197273254395, "rewards/rollout_reward_func/std": 36.432132720947266, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.3125, "sampling/sampling_logp_difference/mean": 0.2520614266395569, "step": 951, "step_time": 33.8200851460133 }, { "clip_ratio/high_max": 0.0031635448685847223, "clip_ratio/high_mean": 0.0031635448685847223, "clip_ratio/low_mean": 0.005165547830983996, "clip_ratio/low_min": 0.005165547830983996, "clip_ratio/region_mean": 0.008329092641361058, "completions/clipped_ratio": 0.0, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 2401.9375, "completions/mean_terminated_length": 2401.9375, "completions/min_length": 2160.0, "completions/min_terminated_length": 2160.0, "entropy": 0.03151252609677613, "epoch": 3.808000152320006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003775977296754718, "kl": 0.01379098987672478, "learning_rate": 7.462979995364839e-06, "loss": 0.0002, "num_tokens": 29105117.0, "reward": 4.087296485900879, "reward_std": 13.110762596130371, "rewards/rollout_reward_func/mean": 4.087296485900879, "rewards/rollout_reward_func/std": 13.110763549804688, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.765647888183594, "sampling/sampling_logp_difference/mean": 0.2576739490032196, "step": 952, "step_time": 33.155787750001764 }, { "clip_ratio/high_max": 0.004099911078810692, "clip_ratio/high_mean": 0.004099911078810692, "clip_ratio/low_mean": 0.003981550602475181, "clip_ratio/low_min": 0.003981550602475181, "clip_ratio/region_mean": 0.008081461652182043, "completions/clipped_ratio": 0.0, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 2428.5625, "completions/mean_terminated_length": 2428.5625, "completions/min_length": 2304.0, "completions/min_terminated_length": 2304.0, "entropy": 0.03185897972434759, "epoch": 3.812000152480006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003335104323923588, "kl": 0.013861046987585723, "learning_rate": 7.462979995354712e-06, "loss": 0.0002, "num_tokens": 29156863.0, "reward": -1.075303316116333, "reward_std": 14.590215682983398, "rewards/rollout_reward_func/mean": -1.075303316116333, "rewards/rollout_reward_func/std": 14.590216636657715, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.9375, "sampling/sampling_logp_difference/mean": 0.2544889748096466, "step": 953, "step_time": 34.02973687798658 }, { "clip_ratio/high_max": 0.004477216338273138, "clip_ratio/high_mean": 0.004477216338273138, "clip_ratio/low_mean": 0.0041235561948269606, "clip_ratio/low_min": 0.0041235561948269606, "clip_ratio/region_mean": 0.008600772533100098, "completions/clipped_ratio": 0.0, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 2512.3125, "completions/mean_terminated_length": 2512.3125, "completions/min_length": 2423.0, "completions/min_terminated_length": 2423.0, "entropy": 0.0300284915138036, "epoch": 3.816000152640006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035747888032346964, "kl": 0.011866794782690704, "learning_rate": 7.4629799953445755e-06, "loss": 0.0001, "num_tokens": 29210009.0, "reward": -5.77065372467041, "reward_std": 11.015703201293945, "rewards/rollout_reward_func/mean": -5.77065372467041, "rewards/rollout_reward_func/std": 11.015703201293945, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.640625, "sampling/sampling_logp_difference/mean": 0.2444508820772171, "step": 954, "step_time": 34.39741863198287 }, { "clip_ratio/high_max": 0.0021080823280499317, "clip_ratio/high_mean": 0.0021080823280499317, "clip_ratio/low_mean": 0.005734396807383746, "clip_ratio/low_min": 0.005734396807383746, "clip_ratio/region_mean": 0.00784247909905389, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 2371.5, "completions/mean_terminated_length": 2371.5, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.03019426646642387, "epoch": 3.8200001528000064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004082475323230028, "kl": 0.012526804464869201, "learning_rate": 7.462979995334427e-06, "loss": 0.0001, "num_tokens": 29260899.0, "reward": 3.741346836090088, "reward_std": 21.99573516845703, "rewards/rollout_reward_func/mean": 3.741346836090088, "rewards/rollout_reward_func/std": 21.9957332611084, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.984375, "sampling/sampling_logp_difference/mean": 0.2480447143316269, "step": 955, "step_time": 33.506554962994414 }, { "clip_ratio/high_max": 0.0032912753813434392, "clip_ratio/high_mean": 0.0032912753813434392, "clip_ratio/low_mean": 0.005222206993494183, "clip_ratio/low_min": 0.005222206993494183, "clip_ratio/region_mean": 0.008513482462149113, "completions/clipped_ratio": 0.0, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 2403.0625, "completions/mean_terminated_length": 2403.0625, "completions/min_length": 2299.0, "completions/min_terminated_length": 2299.0, "entropy": 0.03198603028431535, "epoch": 3.824000152960006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006357620004564524, "kl": 0.012653471669182181, "learning_rate": 7.462979995324269e-06, "loss": 0.0001, "num_tokens": 29312234.0, "reward": -4.346739768981934, "reward_std": 10.691166877746582, "rewards/rollout_reward_func/mean": -4.346739768981934, "rewards/rollout_reward_func/std": 10.691166877746582, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.171875, "sampling/sampling_logp_difference/mean": 0.25658464431762695, "step": 956, "step_time": 33.35780627699569 }, { "clip_ratio/high_max": 0.0023461593373212963, "clip_ratio/high_mean": 0.0023461593373212963, "clip_ratio/low_mean": 0.0047596184886060655, "clip_ratio/low_min": 0.0047596184886060655, "clip_ratio/region_mean": 0.007105777796823531, "completions/clipped_ratio": 0.0, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 2424.5, "completions/mean_terminated_length": 2424.5, "completions/min_length": 2010.0, "completions/min_terminated_length": 2010.0, "entropy": 0.03130925470031798, "epoch": 3.828000153120006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031049216631799936, "kl": 0.012148331617936492, "learning_rate": 7.4629799953140975e-06, "loss": 0.0001, "num_tokens": 29363941.0, "reward": 0.7611620426177979, "reward_std": 20.46497344970703, "rewards/rollout_reward_func/mean": 0.7611620426177979, "rewards/rollout_reward_func/std": 20.46497344970703, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.158203125, "sampling/sampling_logp_difference/mean": 0.2600471079349518, "step": 957, "step_time": 34.14959200999874 }, { "clip_ratio/high_max": 0.0025166570776491426, "clip_ratio/high_mean": 0.0025166570776491426, "clip_ratio/low_mean": 0.0057097334356512874, "clip_ratio/low_min": 0.0057097334356512874, "clip_ratio/region_mean": 0.008226390578784049, "completions/clipped_ratio": 0.0, "completions/max_length": 2447.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 2217.875, "completions/mean_terminated_length": 2217.875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.03131412249058485, "epoch": 3.832000153280006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005123734474182129, "kl": 0.014671992976218462, "learning_rate": 7.462979995303916e-06, "loss": 0.0002, "num_tokens": 29412298.0, "reward": 4.1404008865356445, "reward_std": 22.018253326416016, "rewards/rollout_reward_func/mean": 4.1404008865356445, "rewards/rollout_reward_func/std": 22.018253326416016, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.90625, "sampling/sampling_logp_difference/mean": 0.2647431790828705, "step": 958, "step_time": 32.930438843002776 }, { "clip_ratio/high_max": 0.0037497247685678303, "clip_ratio/high_mean": 0.0037497247685678303, "clip_ratio/low_mean": 0.0038679866993334144, "clip_ratio/low_min": 0.0038679866993334144, "clip_ratio/region_mean": 0.007617711497005075, "completions/clipped_ratio": 0.0, "completions/max_length": 2550.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 2383.8125, "completions/mean_terminated_length": 2383.8125, "completions/min_length": 1758.0, "completions/min_terminated_length": 1758.0, "entropy": 0.030898389173671603, "epoch": 3.836000153440006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009579666890203953, "kl": 0.013942748075351119, "learning_rate": 7.4629799952937255e-06, "loss": 0.0002, "num_tokens": 29463340.0, "reward": 9.647138595581055, "reward_std": 26.22690200805664, "rewards/rollout_reward_func/mean": 9.647138595581055, "rewards/rollout_reward_func/std": 26.22690200805664, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.078125, "sampling/sampling_logp_difference/mean": 0.2574395537376404, "step": 959, "step_time": 33.497273397035315 }, { "clip_ratio/high_max": 0.004040959465783089, "clip_ratio/high_mean": 0.004040959465783089, "clip_ratio/low_mean": 0.003670273465104401, "clip_ratio/low_min": 0.003670273465104401, "clip_ratio/region_mean": 0.0077112329308874905, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 2490.0, "completions/mean_terminated_length": 2490.0, "completions/min_length": 2308.0, "completions/min_terminated_length": 2308.0, "entropy": 0.029924833681434393, "epoch": 3.8400001536000065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002669406123459339, "kl": 0.012387809692882001, "learning_rate": 7.4629799952835205e-06, "loss": 0.0002, "num_tokens": 29516124.0, "reward": -1.823009729385376, "reward_std": 11.67811107635498, "rewards/rollout_reward_func/mean": -1.823009729385376, "rewards/rollout_reward_func/std": 11.67811107635498, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.5, "sampling/sampling_logp_difference/mean": 0.24390217661857605, "step": 960, "step_time": 34.42619237299368 }, { "clip_ratio/high_max": 0.003647423494840041, "clip_ratio/high_mean": 0.003647423494840041, "clip_ratio/low_mean": 0.003855456510791555, "clip_ratio/low_min": 0.003855456510791555, "clip_ratio/region_mean": 0.007502879947423935, "completions/clipped_ratio": 0.0, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 2729.875, "completions/mean_terminated_length": 2729.875, "completions/min_length": 2555.0, "completions/min_terminated_length": 2555.0, "entropy": 0.031420600367709994, "epoch": 3.844000153760006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028886855579912663, "kl": 0.012164155021309853, "learning_rate": 7.462979995273307e-06, "loss": 0.0002, "num_tokens": 29572708.0, "reward": -6.959075450897217, "reward_std": 6.20644998550415, "rewards/rollout_reward_func/mean": -6.959075450897217, "rewards/rollout_reward_func/std": 6.20644998550415, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.03125, "sampling/sampling_logp_difference/mean": 0.24058003723621368, "step": 961, "step_time": 38.15351220501179 }, { "clip_ratio/high_max": 0.0021130117020220496, "clip_ratio/high_mean": 0.0021130117020220496, "clip_ratio/low_mean": 0.005121309397509322, "clip_ratio/low_min": 0.005121309397509322, "clip_ratio/region_mean": 0.0072343210922554135, "completions/clipped_ratio": 0.0, "completions/max_length": 2918.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 2426.5, "completions/mean_terminated_length": 2426.5, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.030208351789042354, "epoch": 3.848000153920006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005863660480827093, "kl": 0.01326483371667564, "learning_rate": 7.4629799952630816e-06, "loss": 0.0002, "num_tokens": 29624442.0, "reward": 1.4234910011291504, "reward_std": 35.656700134277344, "rewards/rollout_reward_func/mean": 1.4234910011291504, "rewards/rollout_reward_func/std": 35.656700134277344, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.28125, "sampling/sampling_logp_difference/mean": 0.249191552400589, "step": 962, "step_time": 36.19884070800617 }, { "clip_ratio/high_max": 0.004347085225163028, "clip_ratio/high_mean": 0.004347085225163028, "clip_ratio/low_mean": 0.003734077006811276, "clip_ratio/low_min": 0.003734077006811276, "clip_ratio/region_mean": 0.008081162290181965, "completions/clipped_ratio": 0.0, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 2527.375, "completions/mean_terminated_length": 2527.375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.029154757503420115, "epoch": 3.8520001540800064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027234593871980906, "kl": 0.012670975993387401, "learning_rate": 7.462979995252846e-06, "loss": 0.0002, "num_tokens": 29677761.0, "reward": -3.8120226860046387, "reward_std": 27.01439666748047, "rewards/rollout_reward_func/mean": -3.8120226860046387, "rewards/rollout_reward_func/std": 27.01439666748047, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 60.453125, "sampling/sampling_logp_difference/mean": 0.24330973625183105, "step": 963, "step_time": 36.715039403992705 }, { "clip_ratio/high_max": 0.00402277332614176, "clip_ratio/high_mean": 0.00402277332614176, "clip_ratio/low_mean": 0.0034852875105571, "clip_ratio/low_min": 0.0034852875105571, "clip_ratio/region_mean": 0.007508060894906521, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 2738.25, "completions/mean_terminated_length": 2738.25, "completions/min_length": 2550.0, "completions/min_terminated_length": 2550.0, "entropy": 0.02946972381323576, "epoch": 3.856000154240006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00527904462069273, "kl": 0.01278847292996943, "learning_rate": 7.462979995242598e-06, "loss": 0.0002, "num_tokens": 29734473.0, "reward": -7.329954624176025, "reward_std": 10.31701946258545, "rewards/rollout_reward_func/mean": -7.329954624176025, "rewards/rollout_reward_func/std": 10.317020416259766, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.25, "sampling/sampling_logp_difference/mean": 0.25131601095199585, "step": 964, "step_time": 38.26298594697437 }, { "clip_ratio/high_max": 0.004059594037244096, "clip_ratio/high_mean": 0.004059594037244096, "clip_ratio/low_mean": 0.0039754358876962215, "clip_ratio/low_min": 0.0039754358876962215, "clip_ratio/region_mean": 0.008035029866732657, "completions/clipped_ratio": 0.0, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 2509.625, "completions/mean_terminated_length": 2509.625, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.030991116305813193, "epoch": 3.860000154400006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.08539566397666931, "kl": 0.015832745586521924, "learning_rate": 7.462979995232339e-06, "loss": 0.0002, "num_tokens": 29787508.0, "reward": -0.5490190982818604, "reward_std": 26.063098907470703, "rewards/rollout_reward_func/mean": -0.5490190982818604, "rewards/rollout_reward_func/std": 26.06309700012207, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.21875, "sampling/sampling_logp_difference/mean": 0.2596321105957031, "step": 965, "step_time": 37.186264297008165 }, { "clip_ratio/high_max": 0.0034898979356512427, "clip_ratio/high_mean": 0.0034898979356512427, "clip_ratio/low_mean": 0.004161673248745501, "clip_ratio/low_min": 0.004161673248745501, "clip_ratio/region_mean": 0.007651571184396744, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 2725.1875, "completions/mean_terminated_length": 2725.1875, "completions/min_length": 2539.0, "completions/min_terminated_length": 2539.0, "entropy": 0.028486072551459074, "epoch": 3.864000154560006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004686338827013969, "kl": 0.014051456935703754, "learning_rate": 7.462979995222069e-06, "loss": 0.0002, "num_tokens": 29844011.0, "reward": -10.1980619430542, "reward_std": 13.679576873779297, "rewards/rollout_reward_func/mean": -10.1980619430542, "rewards/rollout_reward_func/std": 13.67957592010498, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.75, "sampling/sampling_logp_difference/mean": 0.2533400356769562, "step": 966, "step_time": 39.00550961800036 }, { "clip_ratio/high_max": 0.0030572223477065563, "clip_ratio/high_mean": 0.0030572223477065563, "clip_ratio/low_mean": 0.0045628484222106636, "clip_ratio/low_min": 0.0045628484222106636, "clip_ratio/region_mean": 0.00762007076991722, "completions/clipped_ratio": 0.0, "completions/max_length": 2866.0, "completions/max_terminated_length": 2866.0, "completions/mean_length": 2680.5, "completions/mean_terminated_length": 2680.5, "completions/min_length": 1858.0, "completions/min_terminated_length": 1858.0, "entropy": 0.03135753073729575, "epoch": 3.868000154720006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.5109857320785522, "kl": 0.13594754913356155, "learning_rate": 7.462979995211789e-06, "loss": 0.0018, "num_tokens": 29899799.0, "reward": 1.4278714656829834, "reward_std": 38.27693557739258, "rewards/rollout_reward_func/mean": 1.4278714656829834, "rewards/rollout_reward_func/std": 38.27694320678711, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.359375, "sampling/sampling_logp_difference/mean": 0.24906407296657562, "step": 967, "step_time": 38.14137981100066 }, { "clip_ratio/high_max": 0.003673719154903665, "clip_ratio/high_mean": 0.003673719154903665, "clip_ratio/low_mean": 0.0038237728585954756, "clip_ratio/low_min": 0.0038237728585954756, "clip_ratio/region_mean": 0.007497492013499141, "completions/clipped_ratio": 0.0, "completions/max_length": 2903.0, "completions/max_terminated_length": 2903.0, "completions/mean_length": 2539.0625, "completions/mean_terminated_length": 2539.0625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "entropy": 0.03051236690953374, "epoch": 3.8720001548800065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004165086895227432, "kl": 0.013218302163295448, "learning_rate": 7.462979995201497e-06, "loss": 0.0002, "num_tokens": 29953301.0, "reward": -1.4880821704864502, "reward_std": 27.358613967895508, "rewards/rollout_reward_func/mean": -1.4880821704864502, "rewards/rollout_reward_func/std": 27.358612060546875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.3125, "sampling/sampling_logp_difference/mean": 0.2458793818950653, "step": 968, "step_time": 37.07400778900774 }, { "clip_ratio/high_max": 0.0033412920020055026, "clip_ratio/high_mean": 0.0033412920020055026, "clip_ratio/low_mean": 0.004510985279921442, "clip_ratio/low_min": 0.004510985279921442, "clip_ratio/region_mean": 0.007852277311030775, "completions/clipped_ratio": 0.0, "completions/max_length": 2791.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 2603.6875, "completions/mean_terminated_length": 2603.6875, "completions/min_length": 2064.0, "completions/min_terminated_length": 2064.0, "entropy": 0.029925062553957105, "epoch": 3.876000155040006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005416702013462782, "kl": 0.014993553515523672, "learning_rate": 7.462979995191195e-06, "loss": 0.0002, "num_tokens": 30007822.0, "reward": 0.5951250791549683, "reward_std": 28.90692901611328, "rewards/rollout_reward_func/mean": 0.5951250791549683, "rewards/rollout_reward_func/std": 28.90692901611328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.359375, "sampling/sampling_logp_difference/mean": 0.2615281641483307, "step": 969, "step_time": 36.83284219201596 }, { "clip_ratio/high_max": 0.003934348642360419, "clip_ratio/high_mean": 0.003934348642360419, "clip_ratio/low_mean": 0.00392775476211682, "clip_ratio/low_min": 0.00392775476211682, "clip_ratio/region_mean": 0.007862103404477239, "completions/clipped_ratio": 0.0, "completions/max_length": 2910.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 2792.3125, "completions/mean_terminated_length": 2792.3125, "completions/min_length": 2668.0, "completions/min_terminated_length": 2668.0, "entropy": 0.02797484677284956, "epoch": 3.880000155200006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004042238462716341, "kl": 0.012883715680800378, "learning_rate": 7.46297999518088e-06, "loss": 0.0002, "num_tokens": 30065431.0, "reward": -6.38651180267334, "reward_std": 6.76867151260376, "rewards/rollout_reward_func/mean": -6.38651180267334, "rewards/rollout_reward_func/std": 6.76867151260376, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.25, "sampling/sampling_logp_difference/mean": 0.2371024489402771, "step": 970, "step_time": 37.5507366519887 }, { "clip_ratio/high_max": 0.003930716076865792, "clip_ratio/high_mean": 0.003930716076865792, "clip_ratio/low_mean": 0.004664383013732731, "clip_ratio/low_min": 0.004664383013732731, "clip_ratio/region_mean": 0.008595099032390863, "completions/clipped_ratio": 0.0, "completions/max_length": 2776.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 2505.25, "completions/mean_terminated_length": 2505.25, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.03129212511703372, "epoch": 3.8840001553600064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005777652841061354, "kl": 0.015716053312644362, "learning_rate": 7.462979995170556e-06, "loss": 0.0002, "num_tokens": 30118383.0, "reward": -4.338348388671875, "reward_std": 27.09004783630371, "rewards/rollout_reward_func/mean": -4.338348388671875, "rewards/rollout_reward_func/std": 27.09004783630371, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.6328125, "sampling/sampling_logp_difference/mean": 0.24574200809001923, "step": 971, "step_time": 35.876625919976505 }, { "clip_ratio/high_max": 0.002904748500441201, "clip_ratio/high_mean": 0.002904748500441201, "clip_ratio/low_mean": 0.004689087072620168, "clip_ratio/low_min": 0.004689087072620168, "clip_ratio/region_mean": 0.007593835587613285, "completions/clipped_ratio": 0.0, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 2746.75, "completions/mean_terminated_length": 2746.75, "completions/min_length": 2230.0, "completions/min_terminated_length": 2230.0, "entropy": 0.030264207161962986, "epoch": 3.888000155520006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012879430316388607, "kl": 0.01771774934604764, "learning_rate": 7.46297999516022e-06, "loss": 0.0002, "num_tokens": 30175265.0, "reward": -3.1194674968719482, "reward_std": 21.953643798828125, "rewards/rollout_reward_func/mean": -3.1194674968719482, "rewards/rollout_reward_func/std": 21.953645706176758, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.5, "sampling/sampling_logp_difference/mean": 0.23944279551506042, "step": 972, "step_time": 37.76481034401513 }, { "clip_ratio/high_max": 0.004128440952626988, "clip_ratio/high_mean": 0.004128440952626988, "clip_ratio/low_mean": 0.0035831480636261404, "clip_ratio/low_min": 0.0035831480636261404, "clip_ratio/region_mean": 0.007711588987149298, "completions/clipped_ratio": 0.0, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 2727.25, "completions/mean_terminated_length": 2727.25, "completions/min_length": 2517.0, "completions/min_terminated_length": 2517.0, "entropy": 0.03265003184787929, "epoch": 3.8920001556800065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009612726978957653, "kl": 0.017423639656044543, "learning_rate": 7.462979995149873e-06, "loss": 0.0002, "num_tokens": 30231811.0, "reward": 1.6061875820159912, "reward_std": 20.715530395507812, "rewards/rollout_reward_func/mean": 1.6061875820159912, "rewards/rollout_reward_func/std": 20.715532302856445, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.1875, "sampling/sampling_logp_difference/mean": 0.24283809959888458, "step": 973, "step_time": 37.59714694299328 }, { "clip_ratio/high_max": 0.0041752715478651226, "clip_ratio/high_mean": 0.0041752715478651226, "clip_ratio/low_mean": 0.004354596952907741, "clip_ratio/low_min": 0.004354596952907741, "clip_ratio/region_mean": 0.008529868500772864, "completions/clipped_ratio": 0.0, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 2624.75, "completions/mean_terminated_length": 2624.75, "completions/min_length": 2558.0, "completions/min_terminated_length": 2558.0, "entropy": 0.031364280031993985, "epoch": 3.896000155840006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005980250891298056, "kl": 0.017065050546079874, "learning_rate": 7.462979995139515e-06, "loss": 0.0002, "num_tokens": 30286687.0, "reward": -7.763956069946289, "reward_std": 11.065237998962402, "rewards/rollout_reward_func/mean": -7.763956069946289, "rewards/rollout_reward_func/std": 11.065237998962402, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.00783157348633, "sampling/sampling_logp_difference/mean": 0.2449338138103485, "step": 974, "step_time": 36.47628140600864 }, { "clip_ratio/high_max": 0.004478670365642756, "clip_ratio/high_mean": 0.004478670365642756, "clip_ratio/low_mean": 0.003973301063524559, "clip_ratio/low_min": 0.003973301063524559, "clip_ratio/region_mean": 0.008451971458271146, "completions/clipped_ratio": 0.0, "completions/max_length": 2851.0, "completions/max_terminated_length": 2851.0, "completions/mean_length": 2751.625, "completions/mean_terminated_length": 2751.625, "completions/min_length": 2600.0, "completions/min_terminated_length": 2600.0, "entropy": 0.03101369831711054, "epoch": 3.900000156000006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00709106307476759, "kl": 0.018240357167087495, "learning_rate": 7.462979995129145e-06, "loss": 0.0002, "num_tokens": 30343621.0, "reward": -10.67866325378418, "reward_std": 8.58933162689209, "rewards/rollout_reward_func/mean": -10.67866325378418, "rewards/rollout_reward_func/std": 8.58933162689209, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.875, "sampling/sampling_logp_difference/mean": 0.24240076541900635, "step": 975, "step_time": 38.26797653900576 }, { "clip_ratio/high_max": 0.003250535315601155, "clip_ratio/high_mean": 0.003250535315601155, "clip_ratio/low_mean": 0.003972041740780696, "clip_ratio/low_min": 0.003972041740780696, "clip_ratio/region_mean": 0.0072225770563818514, "completions/clipped_ratio": 0.0, "completions/max_length": 2849.0, "completions/max_terminated_length": 2849.0, "completions/mean_length": 2701.9375, "completions/mean_terminated_length": 2701.9375, "completions/min_length": 2534.0, "completions/min_terminated_length": 2534.0, "entropy": 0.03033800097182393, "epoch": 3.9040001561600064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.25998494029045105, "kl": 0.045771002070978284, "learning_rate": 7.462979995118766e-06, "loss": 0.0006, "num_tokens": 30399736.0, "reward": -4.201589584350586, "reward_std": 13.770731925964355, "rewards/rollout_reward_func/mean": -4.201589584350586, "rewards/rollout_reward_func/std": 13.770732879638672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.96875, "sampling/sampling_logp_difference/mean": 0.24049967527389526, "step": 976, "step_time": 37.87719321899931 }, { "clip_ratio/high_max": 0.004211904160911217, "clip_ratio/high_mean": 0.004211904160911217, "clip_ratio/low_mean": 0.003536746487952769, "clip_ratio/low_min": 0.003536746487952769, "clip_ratio/region_mean": 0.007748650677967817, "completions/clipped_ratio": 0.0, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 2731.5625, "completions/mean_terminated_length": 2731.5625, "completions/min_length": 2568.0, "completions/min_terminated_length": 2568.0, "entropy": 0.030719137052074075, "epoch": 3.908000156320006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0061806379817426205, "kl": 0.016347313532605767, "learning_rate": 7.462979995108374e-06, "loss": 0.0002, "num_tokens": 30456345.0, "reward": -6.873320579528809, "reward_std": 10.653471946716309, "rewards/rollout_reward_func/mean": -6.873320579528809, "rewards/rollout_reward_func/std": 10.653472900390625, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.765625, "sampling/sampling_logp_difference/mean": 0.23490451276302338, "step": 977, "step_time": 37.42599650002376 }, { "clip_ratio/high_max": 0.0027140277670696378, "clip_ratio/high_mean": 0.0027140277670696378, "clip_ratio/low_mean": 0.005136330350069329, "clip_ratio/low_min": 0.005136330350069329, "clip_ratio/region_mean": 0.007850358204450458, "completions/clipped_ratio": 0.0, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 2717.375, "completions/mean_terminated_length": 2717.375, "completions/min_length": 2479.0, "completions/min_terminated_length": 2479.0, "entropy": 0.031208484200760722, "epoch": 3.9120001564800066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00942419283092022, "kl": 0.01845557033084333, "learning_rate": 7.462979995097973e-06, "loss": 0.0002, "num_tokens": 30512739.0, "reward": -3.3281846046447754, "reward_std": 19.01272201538086, "rewards/rollout_reward_func/mean": -3.3281846046447754, "rewards/rollout_reward_func/std": 19.01272201538086, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.90625, "sampling/sampling_logp_difference/mean": 0.23338302969932556, "step": 978, "step_time": 37.59273409798334 }, { "clip_ratio/high_max": 0.002121610305039212, "clip_ratio/high_mean": 0.002121610305039212, "clip_ratio/low_mean": 0.005005185579648241, "clip_ratio/low_min": 0.005005185579648241, "clip_ratio/region_mean": 0.007126795942895114, "completions/clipped_ratio": 0.0, "completions/max_length": 2929.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 2623.25, "completions/mean_terminated_length": 2623.25, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.030612635891884565, "epoch": 3.916000156640006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01732192002236843, "kl": 0.022066951263695955, "learning_rate": 7.462979995087559e-06, "loss": 0.0003, "num_tokens": 30567614.0, "reward": 1.6641011238098145, "reward_std": 36.51286315917969, "rewards/rollout_reward_func/mean": 1.6641011238098145, "rewards/rollout_reward_func/std": 36.51286315917969, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.9375, "sampling/sampling_logp_difference/mean": 0.23837649822235107, "step": 979, "step_time": 37.22204584800056 }, { "clip_ratio/high_max": 0.004338175436714664, "clip_ratio/high_mean": 0.004338175436714664, "clip_ratio/low_mean": 0.0034646079875528812, "clip_ratio/low_min": 0.0034646079875528812, "clip_ratio/region_mean": 0.007802783336956054, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 2810.25, "completions/mean_terminated_length": 2810.25, "completions/min_length": 2697.0, "completions/min_terminated_length": 2697.0, "entropy": 0.030305434251204133, "epoch": 3.920000156800006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01400668453425169, "kl": 0.02015350095462054, "learning_rate": 7.462979995077134e-06, "loss": 0.0003, "num_tokens": 30625542.0, "reward": -10.887290954589844, "reward_std": 11.11661434173584, "rewards/rollout_reward_func/mean": -10.887290954589844, "rewards/rollout_reward_func/std": 11.11661434173584, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.0078125, "sampling/sampling_logp_difference/mean": 0.22477902472019196, "step": 980, "step_time": 38.75668694397609 }, { "clip_ratio/high_max": 0.0031805273320060223, "clip_ratio/high_mean": 0.0031805273320060223, "clip_ratio/low_mean": 0.0038618000398855656, "clip_ratio/low_min": 0.0038618000398855656, "clip_ratio/region_mean": 0.007042327371891588, "completions/clipped_ratio": 0.0, "completions/max_length": 2904.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 2766.375, "completions/mean_terminated_length": 2766.375, "completions/min_length": 2078.0, "completions/min_terminated_length": 2078.0, "entropy": 0.031245159218087792, "epoch": 3.9240001569600065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009367636404931545, "kl": 0.019263932830654085, "learning_rate": 7.4629799950666995e-06, "loss": 0.0003, "num_tokens": 30682733.0, "reward": -2.960279703140259, "reward_std": 26.899852752685547, "rewards/rollout_reward_func/mean": -2.960279703140259, "rewards/rollout_reward_func/std": 26.89985466003418, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.0625, "sampling/sampling_logp_difference/mean": 0.23731166124343872, "step": 981, "step_time": 37.963436513004126 }, { "clip_ratio/high_max": 0.0023197015252662823, "clip_ratio/high_mean": 0.0023197015252662823, "clip_ratio/low_mean": 0.0050940609944518656, "clip_ratio/low_min": 0.0050940609944518656, "clip_ratio/region_mean": 0.007413762388750911, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 2512.6875, "completions/mean_terminated_length": 2512.6875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.03153142682276666, "epoch": 3.928000157120006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008599725551903248, "kl": 0.021280436660163105, "learning_rate": 7.462979995056254e-06, "loss": 0.0003, "num_tokens": 30735799.0, "reward": -0.16565239429473877, "reward_std": 28.081125259399414, "rewards/rollout_reward_func/mean": -0.16565239429473877, "rewards/rollout_reward_func/std": 28.08112335205078, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.74020767211914, "sampling/sampling_logp_difference/mean": 0.24125130474567413, "step": 982, "step_time": 37.14654406499176 }, { "clip_ratio/high_max": 0.004098050616448745, "clip_ratio/high_mean": 0.004098050616448745, "clip_ratio/low_mean": 0.003533594368491322, "clip_ratio/low_min": 0.003533594368491322, "clip_ratio/region_mean": 0.0076316449558362365, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 2587.4375, "completions/mean_terminated_length": 2587.4375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.03112421534024179, "epoch": 3.932000157280006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.014270296320319176, "kl": 0.021643718820996583, "learning_rate": 7.4629799950457964e-06, "loss": 0.0003, "num_tokens": 30790105.0, "reward": -9.021422386169434, "reward_std": 28.75621223449707, "rewards/rollout_reward_func/mean": -9.021422386169434, "rewards/rollout_reward_func/std": 28.756214141845703, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.08601760864258, "sampling/sampling_logp_difference/mean": 0.23379622399806976, "step": 983, "step_time": 36.89018044799741 }, { "clip_ratio/high_max": 0.003872925299219787, "clip_ratio/high_mean": 0.003872925299219787, "clip_ratio/low_mean": 0.0038434338348452, "clip_ratio/low_min": 0.0038434338348452, "clip_ratio/region_mean": 0.007716359104961157, "completions/clipped_ratio": 0.0, "completions/max_length": 2906.0, "completions/max_terminated_length": 2906.0, "completions/mean_length": 2801.4375, "completions/mean_terminated_length": 2801.4375, "completions/min_length": 2600.0, "completions/min_terminated_length": 2600.0, "entropy": 0.030868998263031244, "epoch": 3.9360001574400064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012837451882660389, "kl": 0.021568106021732092, "learning_rate": 7.462979995035327e-06, "loss": 0.0003, "num_tokens": 30847847.0, "reward": -11.25865650177002, "reward_std": 8.389006614685059, "rewards/rollout_reward_func/mean": -11.25865650177002, "rewards/rollout_reward_func/std": 8.389007568359375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.2578125, "sampling/sampling_logp_difference/mean": 0.2253459095954895, "step": 984, "step_time": 37.73648962799052 }, { "clip_ratio/high_max": 0.004049334995215759, "clip_ratio/high_mean": 0.004049334995215759, "clip_ratio/low_mean": 0.0034867784997913986, "clip_ratio/low_min": 0.0034867784997913986, "clip_ratio/region_mean": 0.007536113495007157, "completions/clipped_ratio": 0.0, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 2679.0625, "completions/mean_terminated_length": 2679.0625, "completions/min_length": 2567.0, "completions/min_terminated_length": 2567.0, "entropy": 0.029833376174792647, "epoch": 3.940000157600006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0065703438594937325, "kl": 0.019535499275662005, "learning_rate": 7.462979995024848e-06, "loss": 0.0003, "num_tokens": 30903616.0, "reward": -5.574375629425049, "reward_std": 13.661006927490234, "rewards/rollout_reward_func/mean": -5.574375629425049, "rewards/rollout_reward_func/std": 13.661008834838867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.28125, "sampling/sampling_logp_difference/mean": 0.24040842056274414, "step": 985, "step_time": 36.8657513930375 }, { "clip_ratio/high_max": 0.0023087701556505635, "clip_ratio/high_mean": 0.0023087701556505635, "clip_ratio/low_mean": 0.005642223812174052, "clip_ratio/low_min": 0.005642223812174052, "clip_ratio/region_mean": 0.0079509939532727, "completions/clipped_ratio": 0.0, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 2484.375, "completions/mean_terminated_length": 2484.375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.030102995224297047, "epoch": 3.9440001577600066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00716320751234889, "kl": 0.019778860732913017, "learning_rate": 7.462979995014357e-06, "loss": 0.0002, "num_tokens": 30956319.0, "reward": 2.4449756145477295, "reward_std": 32.860877990722656, "rewards/rollout_reward_func/mean": 2.4449756145477295, "rewards/rollout_reward_func/std": 32.86088180541992, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.75, "sampling/sampling_logp_difference/mean": 0.23518647253513336, "step": 986, "step_time": 36.09189537598286 }, { "clip_ratio/high_max": 0.002520773181458935, "clip_ratio/high_mean": 0.002520773181458935, "clip_ratio/low_mean": 0.005301777127897367, "clip_ratio/low_min": 0.005301777127897367, "clip_ratio/region_mean": 0.007822550309356302, "completions/clipped_ratio": 0.0, "completions/max_length": 2870.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 2566.75, "completions/mean_terminated_length": 2566.75, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.031788201769813895, "epoch": 3.948000157920006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013305135071277618, "kl": 0.023203657008707523, "learning_rate": 7.4629799950038565e-06, "loss": 0.0003, "num_tokens": 31010293.0, "reward": 0.6275300979614258, "reward_std": 29.41556739807129, "rewards/rollout_reward_func/mean": 0.6275300979614258, "rewards/rollout_reward_func/std": 29.415569305419922, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.557952880859375, "sampling/sampling_logp_difference/mean": 0.24758876860141754, "step": 987, "step_time": 37.20890147298633 }, { "clip_ratio/high_max": 0.003480063664028421, "clip_ratio/high_mean": 0.003480063664028421, "clip_ratio/low_mean": 0.004339981387602165, "clip_ratio/low_min": 0.004339981387602165, "clip_ratio/region_mean": 0.007820045109838247, "completions/clipped_ratio": 0.0, "completions/max_length": 2869.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 2579.9375, "completions/mean_terminated_length": 2579.9375, "completions/min_length": 1590.0, "completions/min_terminated_length": 1590.0, "entropy": 0.03454134170897305, "epoch": 3.952000158080006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006756577640771866, "kl": 0.020443899324163795, "learning_rate": 7.462979994993342e-06, "loss": 0.0003, "num_tokens": 31064452.0, "reward": 7.256887435913086, "reward_std": 55.26766586303711, "rewards/rollout_reward_func/mean": 7.256887435913086, "rewards/rollout_reward_func/std": 55.26766586303711, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.5, "sampling/sampling_logp_difference/mean": 0.26485779881477356, "step": 988, "step_time": 37.464829381002346 }, { "clip_ratio/high_max": 0.0037896028661634773, "clip_ratio/high_mean": 0.0037896028661634773, "clip_ratio/low_mean": 0.0038046571717131883, "clip_ratio/low_min": 0.0038046571717131883, "clip_ratio/region_mean": 0.007594260037876666, "completions/clipped_ratio": 0.0, "completions/max_length": 2909.0, "completions/max_terminated_length": 2909.0, "completions/mean_length": 2749.625, "completions/mean_terminated_length": 2749.625, "completions/min_length": 2551.0, "completions/min_terminated_length": 2551.0, "entropy": 0.02914240723475814, "epoch": 3.9560001582400065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005163250491023064, "kl": 0.019007271737791598, "learning_rate": 7.462979994982819e-06, "loss": 0.0003, "num_tokens": 31121346.0, "reward": -9.728962898254395, "reward_std": 11.566259384155273, "rewards/rollout_reward_func/mean": -9.728962898254395, "rewards/rollout_reward_func/std": 11.566259384155273, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.015625, "sampling/sampling_logp_difference/mean": 0.23458147048950195, "step": 989, "step_time": 37.403250538001885 }, { "clip_ratio/high_max": 0.004294097452657297, "clip_ratio/high_mean": 0.004294097452657297, "clip_ratio/low_mean": 0.002888761315261945, "clip_ratio/low_min": 0.002888761315261945, "clip_ratio/region_mean": 0.007182858767919242, "completions/clipped_ratio": 0.0, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 2666.0, "completions/mean_terminated_length": 2666.0, "completions/min_length": 2501.0, "completions/min_terminated_length": 2501.0, "entropy": 0.03209766116924584, "epoch": 3.960000158400006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013873312622308731, "kl": 0.02219047979451716, "learning_rate": 7.462979994972285e-06, "loss": 0.0003, "num_tokens": 31176874.0, "reward": -2.9484667778015137, "reward_std": 16.691421508789062, "rewards/rollout_reward_func/mean": -2.9484667778015137, "rewards/rollout_reward_func/std": 16.691423416137695, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.9375, "sampling/sampling_logp_difference/mean": 0.24720752239227295, "step": 990, "step_time": 36.407954035981675 }, { "clip_ratio/high_max": 0.002134559763362631, "clip_ratio/high_mean": 0.002134559763362631, "clip_ratio/low_mean": 0.004567424184642732, "clip_ratio/low_min": 0.004567424184642732, "clip_ratio/region_mean": 0.006701983918901533, "completions/clipped_ratio": 0.0, "completions/max_length": 2938.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 2469.875, "completions/mean_terminated_length": 2469.875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.02956771827302873, "epoch": 3.9640001585600066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006700300145894289, "kl": 0.019366679713129997, "learning_rate": 7.462979994961739e-06, "loss": 0.0002, "num_tokens": 31229325.0, "reward": 3.8337690830230713, "reward_std": 33.33778381347656, "rewards/rollout_reward_func/mean": 3.8337690830230713, "rewards/rollout_reward_func/std": 33.33778381347656, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.3125, "sampling/sampling_logp_difference/mean": 0.23867680132389069, "step": 991, "step_time": 36.480929667988676 }, { "clip_ratio/high_max": 0.004465509031433612, "clip_ratio/high_mean": 0.004465509031433612, "clip_ratio/low_mean": 0.004201293893856928, "clip_ratio/low_min": 0.004201293893856928, "clip_ratio/region_mean": 0.008666802837979048, "completions/clipped_ratio": 0.0, "completions/max_length": 2878.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 2720.0, "completions/mean_terminated_length": 2720.0, "completions/min_length": 2577.0, "completions/min_terminated_length": 2577.0, "entropy": 0.02935936558060348, "epoch": 3.9680001587200064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005390654783695936, "kl": 0.017259097890928388, "learning_rate": 7.462979994951181e-06, "loss": 0.0002, "num_tokens": 31285748.0, "reward": -6.346534252166748, "reward_std": 9.097856521606445, "rewards/rollout_reward_func/mean": -6.346534252166748, "rewards/rollout_reward_func/std": 9.097856521606445, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.172630310058594, "sampling/sampling_logp_difference/mean": 0.23700696229934692, "step": 992, "step_time": 38.18977178398927 }, { "clip_ratio/high_max": 0.0038262180460151285, "clip_ratio/high_mean": 0.0038262180460151285, "clip_ratio/low_mean": 0.004023758345283568, "clip_ratio/low_min": 0.004023758345283568, "clip_ratio/region_mean": 0.007849976362194866, "completions/clipped_ratio": 0.0, "completions/max_length": 2869.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 2690.625, "completions/mean_terminated_length": 2690.625, "completions/min_length": 2564.0, "completions/min_terminated_length": 2564.0, "entropy": 0.031450238078832626, "epoch": 3.972000158880006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0057232994586229324, "kl": 0.018244453007355332, "learning_rate": 7.462979994940614e-06, "loss": 0.0002, "num_tokens": 31341691.0, "reward": -7.221112251281738, "reward_std": 15.028115272521973, "rewards/rollout_reward_func/mean": -7.221112251281738, "rewards/rollout_reward_func/std": 15.028115272521973, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.3984375, "sampling/sampling_logp_difference/mean": 0.24701566994190216, "step": 993, "step_time": 37.6173576329893 }, { "clip_ratio/high_max": 0.0039832572801969945, "clip_ratio/high_mean": 0.0039832572801969945, "clip_ratio/low_mean": 0.003584950929507613, "clip_ratio/low_min": 0.003584950929507613, "clip_ratio/region_mean": 0.007568208209704608, "completions/clipped_ratio": 0.0, "completions/max_length": 2910.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 2737.1875, "completions/mean_terminated_length": 2737.1875, "completions/min_length": 2533.0, "completions/min_terminated_length": 2533.0, "entropy": 0.03181788674555719, "epoch": 3.9760001590400065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00616992823779583, "kl": 0.019263217691332102, "learning_rate": 7.462979994930035e-06, "loss": 0.0003, "num_tokens": 31398405.0, "reward": -6.228950500488281, "reward_std": 10.444253921508789, "rewards/rollout_reward_func/mean": -6.228950500488281, "rewards/rollout_reward_func/std": 10.444253921508789, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.0625, "sampling/sampling_logp_difference/mean": 0.23651500046253204, "step": 994, "step_time": 37.57257907500025 }, { "clip_ratio/high_max": 0.0037037654547020793, "clip_ratio/high_mean": 0.0037037654547020793, "clip_ratio/low_mean": 0.004447691928362474, "clip_ratio/low_min": 0.004447691928362474, "clip_ratio/region_mean": 0.008151457412168384, "completions/clipped_ratio": 0.0, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 2748.5625, "completions/mean_terminated_length": 2748.5625, "completions/min_length": 2599.0, "completions/min_terminated_length": 2599.0, "entropy": 0.029218169394880533, "epoch": 3.980000159200006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0056642768904566765, "kl": 0.016145426081493497, "learning_rate": 7.4629799949194445e-06, "loss": 0.0002, "num_tokens": 31455297.0, "reward": -7.937997817993164, "reward_std": 6.226927757263184, "rewards/rollout_reward_func/mean": -7.937997817993164, "rewards/rollout_reward_func/std": 6.226928234100342, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.53125, "sampling/sampling_logp_difference/mean": 0.244075208902359, "step": 995, "step_time": 37.58478498700424 }, { "clip_ratio/high_max": 0.004333659133408219, "clip_ratio/high_mean": 0.004333659133408219, "clip_ratio/low_mean": 0.0033503101440146565, "clip_ratio/low_min": 0.0033503101440146565, "clip_ratio/region_mean": 0.007683969277422875, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 2565.4375, "completions/mean_terminated_length": 2565.4375, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "entropy": 0.030343707418069243, "epoch": 3.984000159360007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0059464010410010815, "kl": 0.020423763548024, "learning_rate": 7.462979994908844e-06, "loss": 0.0002, "num_tokens": 31509245.0, "reward": -1.8254725933074951, "reward_std": 27.779354095458984, "rewards/rollout_reward_func/mean": -1.8254725933074951, "rewards/rollout_reward_func/std": 27.779354095458984, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.6875114440918, "sampling/sampling_logp_difference/mean": 0.24503399431705475, "step": 996, "step_time": 36.74526026599051 }, { "clip_ratio/high_max": 0.004438839387148619, "clip_ratio/high_mean": 0.004438839387148619, "clip_ratio/low_mean": 0.0034200374357169494, "clip_ratio/low_min": 0.0034200374357169494, "clip_ratio/region_mean": 0.007858876837417483, "completions/clipped_ratio": 0.0, "completions/max_length": 2787.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 2584.75, "completions/mean_terminated_length": 2584.75, "completions/min_length": 1464.0, "completions/min_terminated_length": 1464.0, "entropy": 0.030799251282587647, "epoch": 3.9880001595200064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02004532516002655, "kl": 0.021401040605269372, "learning_rate": 7.4629799948982314e-06, "loss": 0.0003, "num_tokens": 31563477.0, "reward": -1.4612984657287598, "reward_std": 31.56725311279297, "rewards/rollout_reward_func/mean": -1.4612984657287598, "rewards/rollout_reward_func/std": 31.56725311279297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.044925689697266, "sampling/sampling_logp_difference/mean": 0.2480698972940445, "step": 997, "step_time": 36.627891684984206 }, { "clip_ratio/high_max": 0.003497269906802103, "clip_ratio/high_mean": 0.003497269906802103, "clip_ratio/low_mean": 0.004107315238798037, "clip_ratio/low_min": 0.004107315238798037, "clip_ratio/region_mean": 0.00760458514560014, "completions/clipped_ratio": 0.0, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 2648.1875, "completions/mean_terminated_length": 2648.1875, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "entropy": 0.028214096324518323, "epoch": 3.992000159680006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008580394089221954, "kl": 0.0187166704563424, "learning_rate": 7.462979994887608e-06, "loss": 0.0002, "num_tokens": 31618791.0, "reward": -6.939608573913574, "reward_std": 23.071067810058594, "rewards/rollout_reward_func/mean": -6.939608573913574, "rewards/rollout_reward_func/std": 23.07106590270996, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.5625, "sampling/sampling_logp_difference/mean": 0.23945313692092896, "step": 998, "step_time": 37.34876495900971 }, { "clip_ratio/high_max": 0.0037270078901201487, "clip_ratio/high_mean": 0.0037270078901201487, "clip_ratio/low_mean": 0.004285179544240236, "clip_ratio/low_min": 0.004285179544240236, "clip_ratio/region_mean": 0.008012187376152724, "completions/clipped_ratio": 0.0, "completions/max_length": 2913.0, "completions/max_terminated_length": 2913.0, "completions/mean_length": 2710.0, "completions/mean_terminated_length": 2710.0, "completions/min_length": 1517.0, "completions/min_terminated_length": 1517.0, "entropy": 0.031236187554895878, "epoch": 3.9960001598400066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005633962340652943, "kl": 0.017847066279500723, "learning_rate": 7.462979994876974e-06, "loss": 0.0002, "num_tokens": 31675080.0, "reward": 3.037426471710205, "reward_std": 27.766130447387695, "rewards/rollout_reward_func/mean": 3.037426471710205, "rewards/rollout_reward_func/std": 27.766132354736328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.218780517578125, "sampling/sampling_logp_difference/mean": 0.2444852590560913, "step": 999, "step_time": 37.57170789397787 }, { "clip_ratio/high_max": 0.0015347081935033202, "clip_ratio/high_mean": 0.0015347081935033202, "clip_ratio/low_mean": 0.005605575512163341, "clip_ratio/low_min": 0.005605575512163341, "clip_ratio/region_mean": 0.007140283705666661, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 2366.375, "completions/mean_terminated_length": 2366.375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.03132412466220558, "epoch": 4.000000160000006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003899159375578165, "kl": 0.012555969995446503, "learning_rate": 7.46297999486633e-06, "loss": 0.0002, "num_tokens": 31725844.0, "reward": 12.478002548217773, "reward_std": 44.761600494384766, "rewards/rollout_reward_func/mean": 12.478002548217773, "rewards/rollout_reward_func/std": 44.761600494384766, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.15625, "sampling/sampling_logp_difference/mean": 0.24335362017154694, "step": 1000, "step_time": 35.63440906300093 }, { "clip_ratio/high_max": 0.0044113644398748875, "clip_ratio/high_mean": 0.0044113644398748875, "clip_ratio/low_mean": 0.003727744275238365, "clip_ratio/low_min": 0.003727744275238365, "clip_ratio/region_mean": 0.008139108773320913, "completions/clipped_ratio": 0.0, "completions/max_length": 2823.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 2748.9375, "completions/mean_terminated_length": 2748.9375, "completions/min_length": 2683.0, "completions/min_terminated_length": 2683.0, "entropy": 0.028273491421714425, "epoch": 4.004000160160006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006778429728001356, "kl": 0.0190530635882169, "learning_rate": 7.462979994855673e-06, "loss": 0.0003, "num_tokens": 31782746.0, "reward": -9.371927261352539, "reward_std": 10.029258728027344, "rewards/rollout_reward_func/mean": -9.371927261352539, "rewards/rollout_reward_func/std": 10.029258728027344, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.546875, "sampling/sampling_logp_difference/mean": 0.24237915873527527, "step": 1001, "step_time": 37.92107174402918 }, { "clip_ratio/high_max": 0.004700639023212716, "clip_ratio/high_mean": 0.004700639023212716, "clip_ratio/low_mean": 0.0038691559457220137, "clip_ratio/low_min": 0.0038691559457220137, "clip_ratio/region_mean": 0.00856979499803856, "completions/clipped_ratio": 0.0, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 2681.9375, "completions/mean_terminated_length": 2681.9375, "completions/min_length": 1479.0, "completions/min_terminated_length": 1479.0, "entropy": 0.030645038932561874, "epoch": 4.0080001603200065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.016799475997686386, "kl": 0.019840675056912005, "learning_rate": 7.462979994845006e-06, "loss": 0.0003, "num_tokens": 31838577.0, "reward": 2.0174942016601562, "reward_std": 33.7762451171875, "rewards/rollout_reward_func/mean": 2.0174942016601562, "rewards/rollout_reward_func/std": 33.7762451171875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.0625114440918, "sampling/sampling_logp_difference/mean": 0.24527312815189362, "step": 1002, "step_time": 37.572154294961365 }, { "clip_ratio/high_max": 0.0034517166495788842, "clip_ratio/high_mean": 0.0034517166495788842, "clip_ratio/low_mean": 0.003938140289392322, "clip_ratio/low_min": 0.003938140289392322, "clip_ratio/region_mean": 0.007389856968075037, "completions/clipped_ratio": 0.0, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 2741.625, "completions/mean_terminated_length": 2741.625, "completions/min_length": 2605.0, "completions/min_terminated_length": 2605.0, "entropy": 0.029692498268559575, "epoch": 4.012000160480006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013358141295611858, "kl": 0.01868908922187984, "learning_rate": 7.462979994834328e-06, "loss": 0.0002, "num_tokens": 31895369.0, "reward": -8.6488676071167, "reward_std": 6.828946590423584, "rewards/rollout_reward_func/mean": -8.6488676071167, "rewards/rollout_reward_func/std": 6.828947067260742, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.375, "sampling/sampling_logp_difference/mean": 0.23611770570278168, "step": 1003, "step_time": 37.54012080398388 }, { "clip_ratio/high_max": 0.0026313474954804406, "clip_ratio/high_mean": 0.0026313474954804406, "clip_ratio/low_mean": 0.005999698973027989, "clip_ratio/low_min": 0.005999698973027989, "clip_ratio/region_mean": 0.008631046337541193, "completions/clipped_ratio": 0.0, "completions/max_length": 2796.0, "completions/max_terminated_length": 2796.0, "completions/mean_length": 2372.8125, "completions/mean_terminated_length": 2372.8125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.029268067562952638, "epoch": 4.0160001606400067e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009233122691512108, "kl": 0.018622168339788914, "learning_rate": 7.4629799948236375e-06, "loss": 0.0002, "num_tokens": 31946239.0, "reward": 5.606559753417969, "reward_std": 35.2353401184082, "rewards/rollout_reward_func/mean": 5.606559753417969, "rewards/rollout_reward_func/std": 35.23534393310547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.125, "sampling/sampling_logp_difference/mean": 0.25189313292503357, "step": 1004, "step_time": 35.451386183995055 }, { "clip_ratio/high_max": 0.005219108919845894, "clip_ratio/high_mean": 0.005219108919845894, "clip_ratio/low_mean": 0.002774011081783101, "clip_ratio/low_min": 0.002774011081783101, "clip_ratio/region_mean": 0.007993120059836656, "completions/clipped_ratio": 0.0, "completions/max_length": 2964.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 2784.5, "completions/mean_terminated_length": 2784.5, "completions/min_length": 2656.0, "completions/min_terminated_length": 2656.0, "entropy": 0.028158560395240784, "epoch": 4.0200001608000064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004462467040866613, "kl": 0.016051304643042386, "learning_rate": 7.462979994812938e-06, "loss": 0.0002, "num_tokens": 32003707.0, "reward": -6.529419898986816, "reward_std": 9.313372611999512, "rewards/rollout_reward_func/mean": -6.529419898986816, "rewards/rollout_reward_func/std": 9.313373565673828, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.625, "sampling/sampling_logp_difference/mean": 0.23456543684005737, "step": 1005, "step_time": 38.75712672898953 }, { "clip_ratio/high_max": 0.00382355839246884, "clip_ratio/high_mean": 0.00382355839246884, "clip_ratio/low_mean": 0.004436418297700584, "clip_ratio/low_min": 0.004436418297700584, "clip_ratio/region_mean": 0.008259976690169424, "completions/clipped_ratio": 0.0, "completions/max_length": 2955.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 2814.1875, "completions/mean_terminated_length": 2814.1875, "completions/min_length": 2663.0, "completions/min_terminated_length": 2663.0, "entropy": 0.027557983295992017, "epoch": 4.024000160960006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005701098125427961, "kl": 0.015739836962893605, "learning_rate": 7.462979994802225e-06, "loss": 0.0002, "num_tokens": 32061667.0, "reward": -6.437155246734619, "reward_std": 12.113664627075195, "rewards/rollout_reward_func/mean": -6.437155246734619, "rewards/rollout_reward_func/std": 12.113664627075195, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.375003814697266, "sampling/sampling_logp_difference/mean": 0.23480407893657684, "step": 1006, "step_time": 38.3103453740041 }, { "clip_ratio/high_max": 0.004118372191442177, "clip_ratio/high_mean": 0.004118372191442177, "clip_ratio/low_mean": 0.003886686870828271, "clip_ratio/low_min": 0.003886686870828271, "clip_ratio/region_mean": 0.008005059091374278, "completions/clipped_ratio": 0.0, "completions/max_length": 2921.0, "completions/max_terminated_length": 2921.0, "completions/mean_length": 2754.5625, "completions/mean_terminated_length": 2754.5625, "completions/min_length": 2549.0, "completions/min_terminated_length": 2549.0, "entropy": 0.029307747958227992, "epoch": 4.0280001611200066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011958183720707893, "kl": 0.019467074889689684, "learning_rate": 7.462979994791504e-06, "loss": 0.0003, "num_tokens": 32118651.0, "reward": -12.28847885131836, "reward_std": 11.691740989685059, "rewards/rollout_reward_func/mean": -12.28847885131836, "rewards/rollout_reward_func/std": 11.691741943359375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.875, "sampling/sampling_logp_difference/mean": 0.2497502714395523, "step": 1007, "step_time": 37.524292154979776 }, { "clip_ratio/high_max": 0.003398056316655129, "clip_ratio/high_mean": 0.003398056316655129, "clip_ratio/low_mean": 0.004242947208695114, "clip_ratio/low_min": 0.004242947208695114, "clip_ratio/region_mean": 0.007641003583557904, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 2703.9375, "completions/mean_terminated_length": 2703.9375, "completions/min_length": 2596.0, "completions/min_terminated_length": 2596.0, "entropy": 0.027764456812292337, "epoch": 4.032000161280006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004905745852738619, "kl": 0.01587540318723768, "learning_rate": 7.462979994780769e-06, "loss": 0.0002, "num_tokens": 32174825.0, "reward": -3.8041529655456543, "reward_std": 7.580173969268799, "rewards/rollout_reward_func/mean": -3.8041529655456543, "rewards/rollout_reward_func/std": 7.580174446105957, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.15625, "sampling/sampling_logp_difference/mean": 0.24635745584964752, "step": 1008, "step_time": 40.6054577089817 }, { "clip_ratio/high_max": 0.0031437576399184763, "clip_ratio/high_mean": 0.0031437576399184763, "clip_ratio/low_mean": 0.0041589129832573235, "clip_ratio/low_min": 0.0041589129832573235, "clip_ratio/region_mean": 0.0073026706231758, "completions/clipped_ratio": 0.0, "completions/max_length": 2739.0, "completions/max_terminated_length": 2739.0, "completions/mean_length": 2505.25, "completions/mean_terminated_length": 2505.25, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.030754949897527695, "epoch": 4.036000161440007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00822599045932293, "kl": 0.018122458714060485, "learning_rate": 7.4629799947700245e-06, "loss": 0.0002, "num_tokens": 32227799.0, "reward": 1.614790916442871, "reward_std": 28.83724594116211, "rewards/rollout_reward_func/mean": 1.614790916442871, "rewards/rollout_reward_func/std": 28.83724594116211, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.21875, "sampling/sampling_logp_difference/mean": 0.25737255811691284, "step": 1009, "step_time": 36.60224286101584 }, { "clip_ratio/high_max": 0.0036281794600654393, "clip_ratio/high_mean": 0.0036281794600654393, "clip_ratio/low_mean": 0.003943693533074111, "clip_ratio/low_min": 0.003943693533074111, "clip_ratio/region_mean": 0.0075718730222433805, "completions/clipped_ratio": 0.0, "completions/max_length": 2888.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 2681.8125, "completions/mean_terminated_length": 2681.8125, "completions/min_length": 2195.0, "completions/min_terminated_length": 2195.0, "entropy": 0.03157367603853345, "epoch": 4.0400001616000065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00877293385565281, "kl": 0.01672227564267814, "learning_rate": 7.462979994759269e-06, "loss": 0.0002, "num_tokens": 32283602.0, "reward": 4.881223678588867, "reward_std": 37.2124137878418, "rewards/rollout_reward_func/mean": 4.881223678588867, "rewards/rollout_reward_func/std": 37.2124137878418, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 42.6875, "sampling/sampling_logp_difference/mean": 0.24629253149032593, "step": 1010, "step_time": 37.47978897400026 }, { "clip_ratio/high_max": 0.004137350915698335, "clip_ratio/high_mean": 0.004137350915698335, "clip_ratio/low_mean": 0.0033991050731856376, "clip_ratio/low_min": 0.0033991050731856376, "clip_ratio/region_mean": 0.007536455930676311, "completions/clipped_ratio": 0.0, "completions/max_length": 2897.0, "completions/max_terminated_length": 2897.0, "completions/mean_length": 2711.625, "completions/mean_terminated_length": 2711.625, "completions/min_length": 2566.0, "completions/min_terminated_length": 2566.0, "entropy": 0.030030966736376286, "epoch": 4.044000161760006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00976866390556097, "kl": 0.015367253799922764, "learning_rate": 7.462979994748503e-06, "loss": 0.0002, "num_tokens": 32339888.0, "reward": -6.843618392944336, "reward_std": 7.666231632232666, "rewards/rollout_reward_func/mean": -6.843618392944336, "rewards/rollout_reward_func/std": 7.666232109069824, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.0, "sampling/sampling_logp_difference/mean": 0.24174930155277252, "step": 1011, "step_time": 37.76264703601191 }, { "clip_ratio/high_max": 0.0022963201627135277, "clip_ratio/high_mean": 0.0022963201627135277, "clip_ratio/low_mean": 0.005895344540476799, "clip_ratio/low_min": 0.005895344540476799, "clip_ratio/region_mean": 0.008191664761397988, "completions/clipped_ratio": 0.0, "completions/max_length": 2856.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 2547.4375, "completions/mean_terminated_length": 2547.4375, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.030088389990851283, "epoch": 4.0480001619200066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004907436203211546, "kl": 0.014676430146209896, "learning_rate": 7.4629799947377255e-06, "loss": 0.0002, "num_tokens": 32393552.0, "reward": 10.958284378051758, "reward_std": 39.46455383300781, "rewards/rollout_reward_func/mean": 10.958284378051758, "rewards/rollout_reward_func/std": 39.46455764770508, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.84382629394531, "sampling/sampling_logp_difference/mean": 0.24889113008975983, "step": 1012, "step_time": 37.42688308398647 }, { "clip_ratio/high_max": 0.0041311785753350705, "clip_ratio/high_mean": 0.0041311785753350705, "clip_ratio/low_mean": 0.0036256361636333168, "clip_ratio/low_min": 0.0036256361636333168, "clip_ratio/region_mean": 0.007756814651656896, "completions/clipped_ratio": 0.0, "completions/max_length": 2709.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 2628.0625, "completions/mean_terminated_length": 2628.0625, "completions/min_length": 2575.0, "completions/min_terminated_length": 2575.0, "entropy": 0.030223648762330413, "epoch": 4.0520001620800064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01247127540409565, "kl": 0.01673278270754963, "learning_rate": 7.462979994726936e-06, "loss": 0.0002, "num_tokens": 32448473.0, "reward": -11.072691917419434, "reward_std": 8.578638076782227, "rewards/rollout_reward_func/mean": -11.072691917419434, "rewards/rollout_reward_func/std": 8.578639030456543, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.60160827636719, "sampling/sampling_logp_difference/mean": 0.25537994503974915, "step": 1013, "step_time": 37.17618025401316 }, { "clip_ratio/high_max": 0.003830880596069619, "clip_ratio/high_mean": 0.003830880596069619, "clip_ratio/low_mean": 0.003979310160502791, "clip_ratio/low_min": 0.003979310160502791, "clip_ratio/region_mean": 0.007810190785676241, "completions/clipped_ratio": 0.0, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 2802.5625, "completions/mean_terminated_length": 2802.5625, "completions/min_length": 2662.0, "completions/min_terminated_length": 2662.0, "entropy": 0.0294763317797333, "epoch": 4.056000162240007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.014862554147839546, "kl": 0.020930989179760218, "learning_rate": 7.4629799947161355e-06, "loss": 0.0003, "num_tokens": 32506257.0, "reward": -10.194786071777344, "reward_std": 10.719767570495605, "rewards/rollout_reward_func/mean": -10.194786071777344, "rewards/rollout_reward_func/std": 10.719767570495605, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.28125, "sampling/sampling_logp_difference/mean": 0.2365216761827469, "step": 1014, "step_time": 37.95642482803669 }, { "clip_ratio/high_max": 0.0048269471735693514, "clip_ratio/high_mean": 0.0048269471735693514, "clip_ratio/low_mean": 0.0030281704966910183, "clip_ratio/low_min": 0.0030281704966910183, "clip_ratio/region_mean": 0.00785511767026037, "completions/clipped_ratio": 0.0, "completions/max_length": 2864.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 2625.9375, "completions/mean_terminated_length": 2625.9375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.029357471968978643, "epoch": 4.0600001624000065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011717339046299458, "kl": 0.01776455994695425, "learning_rate": 7.462979994705325e-06, "loss": 0.0002, "num_tokens": 32561201.0, "reward": 0.2527275085449219, "reward_std": 27.661195755004883, "rewards/rollout_reward_func/mean": 0.2527275085449219, "rewards/rollout_reward_func/std": 27.661195755004883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.25, "sampling/sampling_logp_difference/mean": 0.24644286930561066, "step": 1015, "step_time": 37.36514089800767 }, { "clip_ratio/high_max": 0.004170925007201731, "clip_ratio/high_mean": 0.004170925007201731, "clip_ratio/low_mean": 0.004463854507775977, "clip_ratio/low_min": 0.004463854507775977, "clip_ratio/region_mean": 0.008634779485873878, "completions/clipped_ratio": 0.0, "completions/max_length": 2896.0, "completions/max_terminated_length": 2896.0, "completions/mean_length": 2741.3125, "completions/mean_terminated_length": 2741.3125, "completions/min_length": 2439.0, "completions/min_terminated_length": 2439.0, "entropy": 0.02921445411629975, "epoch": 4.064000162560006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0073621380142867565, "kl": 0.01570441306103021, "learning_rate": 7.462979994694503e-06, "loss": 0.0002, "num_tokens": 32617959.0, "reward": -8.452407836914062, "reward_std": 16.746461868286133, "rewards/rollout_reward_func/mean": -8.452407836914062, "rewards/rollout_reward_func/std": 16.746461868286133, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.093753814697266, "sampling/sampling_logp_difference/mean": 0.2480253279209137, "step": 1016, "step_time": 37.51236507498834 }, { "clip_ratio/high_max": 0.0034815575345419347, "clip_ratio/high_mean": 0.0034815575345419347, "clip_ratio/low_mean": 0.00403977875248529, "clip_ratio/low_min": 0.00403977875248529, "clip_ratio/region_mean": 0.0075213361997157335, "completions/clipped_ratio": 0.0, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 2594.4375, "completions/mean_terminated_length": 2594.4375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.030763066140934825, "epoch": 4.068000162720007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005524775944650173, "kl": 0.014789774548262358, "learning_rate": 7.46297999468367e-06, "loss": 0.0002, "num_tokens": 32672387.0, "reward": -1.134265661239624, "reward_std": 25.620935440063477, "rewards/rollout_reward_func/mean": -1.134265661239624, "rewards/rollout_reward_func/std": 25.620933532714844, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.40625, "sampling/sampling_logp_difference/mean": 0.2468082159757614, "step": 1017, "step_time": 37.46981919297832 }, { "clip_ratio/high_max": 0.0030498469131998718, "clip_ratio/high_mean": 0.0030498469131998718, "clip_ratio/low_mean": 0.006096180382883176, "clip_ratio/low_min": 0.006096180382883176, "clip_ratio/region_mean": 0.009146027325186878, "completions/clipped_ratio": 0.0, "completions/max_length": 2769.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 2552.0625, "completions/mean_terminated_length": 2552.0625, "completions/min_length": 1598.0, "completions/min_terminated_length": 1598.0, "entropy": 0.031091123586520553, "epoch": 4.0720001628800064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008264859206974506, "kl": 0.0191625386942178, "learning_rate": 7.462979994672825e-06, "loss": 0.0002, "num_tokens": 32726118.0, "reward": 18.848716735839844, "reward_std": 46.021461486816406, "rewards/rollout_reward_func/mean": 18.848716735839844, "rewards/rollout_reward_func/std": 46.021461486816406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.78125, "sampling/sampling_logp_difference/mean": 0.2795681953430176, "step": 1018, "step_time": 36.452661022005486 }, { "clip_ratio/high_max": 0.004590736760292202, "clip_ratio/high_mean": 0.004590736760292202, "clip_ratio/low_mean": 0.003534949093591422, "clip_ratio/low_min": 0.003534949093591422, "clip_ratio/region_mean": 0.008125685853883624, "completions/clipped_ratio": 0.0, "completions/max_length": 2854.0, "completions/max_terminated_length": 2854.0, "completions/mean_length": 2688.75, "completions/mean_terminated_length": 2688.75, "completions/min_length": 2525.0, "completions/min_terminated_length": 2525.0, "entropy": 0.029526482801884413, "epoch": 4.076000163040007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0054216645658016205, "kl": 0.014143112814053893, "learning_rate": 7.46297999466197e-06, "loss": 0.0002, "num_tokens": 32782007.0, "reward": -9.31887435913086, "reward_std": 8.486753463745117, "rewards/rollout_reward_func/mean": -9.31887435913086, "rewards/rollout_reward_func/std": 8.486753463745117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.906253814697266, "sampling/sampling_logp_difference/mean": 0.24700021743774414, "step": 1019, "step_time": 38.070288549992256 }, { "clip_ratio/high_max": 0.00488626369042322, "clip_ratio/high_mean": 0.00488626369042322, "clip_ratio/low_mean": 0.0028924350044690073, "clip_ratio/low_min": 0.0028924350044690073, "clip_ratio/region_mean": 0.007778698694892228, "completions/clipped_ratio": 0.0, "completions/max_length": 2922.0, "completions/max_terminated_length": 2922.0, "completions/mean_length": 2788.25, "completions/mean_terminated_length": 2788.25, "completions/min_length": 2578.0, "completions/min_terminated_length": 2578.0, "entropy": 0.027905674651265144, "epoch": 4.0800001632000066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012151656672358513, "kl": 0.015995909459888935, "learning_rate": 7.4629799946511045e-06, "loss": 0.0002, "num_tokens": 32839549.0, "reward": -6.8961639404296875, "reward_std": 13.949142456054688, "rewards/rollout_reward_func/mean": -6.8961639404296875, "rewards/rollout_reward_func/std": 13.949143409729004, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.40625, "sampling/sampling_logp_difference/mean": 0.23944810032844543, "step": 1020, "step_time": 37.95126752500073 }, { "clip_ratio/high_max": 0.0048933212528936565, "clip_ratio/high_mean": 0.0048933212528936565, "clip_ratio/low_mean": 0.003158799692755565, "clip_ratio/low_min": 0.003158799692755565, "clip_ratio/region_mean": 0.008052120916545391, "completions/clipped_ratio": 0.0, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 2662.3125, "completions/mean_terminated_length": 2662.3125, "completions/min_length": 1604.0, "completions/min_terminated_length": 1604.0, "entropy": 0.030562093015760183, "epoch": 4.084000163360006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010322033427655697, "kl": 0.015191662474535406, "learning_rate": 7.462979994640227e-06, "loss": 0.0002, "num_tokens": 32895035.0, "reward": -1.2720890045166016, "reward_std": 36.265010833740234, "rewards/rollout_reward_func/mean": -1.2720890045166016, "rewards/rollout_reward_func/std": 36.265010833740234, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.171878814697266, "sampling/sampling_logp_difference/mean": 0.2467760145664215, "step": 1021, "step_time": 37.70260396800586 }, { "clip_ratio/high_max": 0.005148945638211444, "clip_ratio/high_mean": 0.005148945638211444, "clip_ratio/low_mean": 0.003369123471202329, "clip_ratio/low_min": 0.003369123471202329, "clip_ratio/region_mean": 0.008518069225829095, "completions/clipped_ratio": 0.0, "completions/max_length": 2921.0, "completions/max_terminated_length": 2921.0, "completions/mean_length": 2700.4375, "completions/mean_terminated_length": 2700.4375, "completions/min_length": 2551.0, "completions/min_terminated_length": 2551.0, "entropy": 0.03084377176128328, "epoch": 4.088000163520007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012656756676733494, "kl": 0.014839448616839945, "learning_rate": 7.462979994629338e-06, "loss": 0.0002, "num_tokens": 32951137.0, "reward": -3.5423521995544434, "reward_std": 21.342304229736328, "rewards/rollout_reward_func/mean": -3.5423521995544434, "rewards/rollout_reward_func/std": 21.342304229736328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.703125, "sampling/sampling_logp_difference/mean": 0.2519684433937073, "step": 1022, "step_time": 37.73864136097836 }, { "clip_ratio/high_max": 0.004554408078547567, "clip_ratio/high_mean": 0.004554408078547567, "clip_ratio/low_mean": 0.004155580798396841, "clip_ratio/low_min": 0.004155580798396841, "clip_ratio/region_mean": 0.008709988789632916, "completions/clipped_ratio": 0.0, "completions/max_length": 2891.0, "completions/max_terminated_length": 2891.0, "completions/mean_length": 2355.5625, "completions/mean_terminated_length": 2355.5625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.03355709812603891, "epoch": 4.0920001636800065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.23911166191101074, "kl": 0.0629272994119674, "learning_rate": 7.4629799946184395e-06, "loss": 0.0008, "num_tokens": 33001723.0, "reward": 13.709146499633789, "reward_std": 49.68862533569336, "rewards/rollout_reward_func/mean": 13.709146499633789, "rewards/rollout_reward_func/std": 49.688621520996094, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.5625, "sampling/sampling_logp_difference/mean": 0.25971519947052, "step": 1023, "step_time": 35.9866244920122 }, { "clip_ratio/high_max": 0.003375535292434506, "clip_ratio/high_mean": 0.003375535292434506, "clip_ratio/low_mean": 0.0045624068297911435, "clip_ratio/low_min": 0.0045624068297911435, "clip_ratio/region_mean": 0.007937942165881395, "completions/clipped_ratio": 0.0, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 2606.6875, "completions/mean_terminated_length": 2606.6875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.02867933246307075, "epoch": 4.096000163840006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007110403385013342, "kl": 0.013352646492421627, "learning_rate": 7.462979994607529e-06, "loss": 0.0002, "num_tokens": 33056350.0, "reward": -5.9924821853637695, "reward_std": 24.382144927978516, "rewards/rollout_reward_func/mean": -5.9924821853637695, "rewards/rollout_reward_func/std": 24.382144927978516, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.21875, "sampling/sampling_logp_difference/mean": 0.23859673738479614, "step": 1024, "step_time": 37.227270725983544 }, { "clip_ratio/high_max": 0.0032490902522113174, "clip_ratio/high_mean": 0.0032490902522113174, "clip_ratio/low_mean": 0.004088679270353168, "clip_ratio/low_min": 0.004088679270353168, "clip_ratio/region_mean": 0.007337769551668316, "completions/clipped_ratio": 0.0, "completions/max_length": 2798.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 2563.125, "completions/mean_terminated_length": 2563.125, "completions/min_length": 1462.0, "completions/min_terminated_length": 1462.0, "entropy": 0.031187445390969515, "epoch": 4.1000001640000067e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004548744764178991, "kl": 0.015177968423813581, "learning_rate": 7.462979994596607e-06, "loss": 0.0002, "num_tokens": 33110233.0, "reward": -0.3101768493652344, "reward_std": 35.38490676879883, "rewards/rollout_reward_func/mean": -0.3101768493652344, "rewards/rollout_reward_func/std": 35.38490676879883, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.62504196166992, "sampling/sampling_logp_difference/mean": 0.2565726637840271, "step": 1025, "step_time": 36.73094235402823 }, { "clip_ratio/high_max": 0.004022727836854756, "clip_ratio/high_mean": 0.004022727836854756, "clip_ratio/low_mean": 0.00422112169326283, "clip_ratio/low_min": 0.00422112169326283, "clip_ratio/region_mean": 0.008243849501013756, "completions/clipped_ratio": 0.0, "completions/max_length": 2906.0, "completions/max_terminated_length": 2906.0, "completions/mean_length": 2764.6875, "completions/mean_terminated_length": 2764.6875, "completions/min_length": 2561.0, "completions/min_terminated_length": 2561.0, "entropy": 0.030161626171320677, "epoch": 4.1040001641600064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006350718438625336, "kl": 0.01438739465083927, "learning_rate": 7.462979994585675e-06, "loss": 0.0002, "num_tokens": 33167379.0, "reward": -6.317413806915283, "reward_std": 10.157347679138184, "rewards/rollout_reward_func/mean": -6.317413806915283, "rewards/rollout_reward_func/std": 10.157347679138184, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.9375, "sampling/sampling_logp_difference/mean": 0.24640247225761414, "step": 1026, "step_time": 37.903516607984784 }, { "clip_ratio/high_max": 0.004344905450125225, "clip_ratio/high_mean": 0.004344905450125225, "clip_ratio/low_mean": 0.003964518051361665, "clip_ratio/low_min": 0.003964518051361665, "clip_ratio/region_mean": 0.008309423516038805, "completions/clipped_ratio": 0.0, "completions/max_length": 2875.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 2597.875, "completions/mean_terminated_length": 2597.875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.03037110809236765, "epoch": 4.108000164320007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007452994119375944, "kl": 0.01571794762276113, "learning_rate": 7.462979994574732e-06, "loss": 0.0002, "num_tokens": 33221854.0, "reward": -7.545384407043457, "reward_std": 28.185152053833008, "rewards/rollout_reward_func/mean": -7.545384407043457, "rewards/rollout_reward_func/std": 28.18515396118164, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.15625, "sampling/sampling_logp_difference/mean": 0.24212993681430817, "step": 1027, "step_time": 37.37324765200901 }, { "clip_ratio/high_max": 0.004278657812392339, "clip_ratio/high_mean": 0.004278657812392339, "clip_ratio/low_mean": 0.003116570442216471, "clip_ratio/low_min": 0.003116570442216471, "clip_ratio/region_mean": 0.007395228312816471, "completions/clipped_ratio": 0.0, "completions/max_length": 2912.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 2799.5, "completions/mean_terminated_length": 2799.5, "completions/min_length": 2685.0, "completions/min_terminated_length": 2685.0, "entropy": 0.02854717173613608, "epoch": 4.1120001644800066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008295872248709202, "kl": 0.014686775393784046, "learning_rate": 7.462979994563777e-06, "loss": 0.0002, "num_tokens": 33279596.0, "reward": -0.33020079135894775, "reward_std": 8.913201332092285, "rewards/rollout_reward_func/mean": -0.33020079135894775, "rewards/rollout_reward_func/std": 8.913201332092285, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.65625, "sampling/sampling_logp_difference/mean": 0.24196359515190125, "step": 1028, "step_time": 38.01332136998826 }, { "clip_ratio/high_max": 0.0036834660277236253, "clip_ratio/high_mean": 0.0036834660277236253, "clip_ratio/low_mean": 0.0034882044419646263, "clip_ratio/low_min": 0.0034882044419646263, "clip_ratio/region_mean": 0.007171670440584421, "completions/clipped_ratio": 0.0, "completions/max_length": 2908.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 2793.8125, "completions/mean_terminated_length": 2793.8125, "completions/min_length": 2628.0, "completions/min_terminated_length": 2628.0, "entropy": 0.02845464670099318, "epoch": 4.116000164640006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00580417737364769, "kl": 0.01506036811042577, "learning_rate": 7.462979994552811e-06, "loss": 0.0002, "num_tokens": 33337238.0, "reward": -7.883932113647461, "reward_std": 8.190228462219238, "rewards/rollout_reward_func/mean": -7.883932113647461, "rewards/rollout_reward_func/std": 8.190228462219238, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.875, "sampling/sampling_logp_difference/mean": 0.23731856048107147, "step": 1029, "step_time": 37.91423241903249 }, { "clip_ratio/high_max": 0.005102694281958975, "clip_ratio/high_mean": 0.005102694281958975, "clip_ratio/low_mean": 0.0034252921759616584, "clip_ratio/low_min": 0.0034252921759616584, "clip_ratio/region_mean": 0.00852798653068021, "completions/clipped_ratio": 0.0, "completions/max_length": 2900.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 2747.0, "completions/mean_terminated_length": 2747.0, "completions/min_length": 2571.0, "completions/min_terminated_length": 2571.0, "entropy": 0.030515017919242382, "epoch": 4.120000164800007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.014324570074677467, "kl": 0.022063027368858457, "learning_rate": 7.462979994541835e-06, "loss": 0.0003, "num_tokens": 33394099.0, "reward": -7.741986274719238, "reward_std": 10.097696304321289, "rewards/rollout_reward_func/mean": -7.741986274719238, "rewards/rollout_reward_func/std": 10.097696304321289, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.546875, "sampling/sampling_logp_difference/mean": 0.2476709485054016, "step": 1030, "step_time": 37.841612854987034 }, { "clip_ratio/high_max": 0.0021691583533538505, "clip_ratio/high_mean": 0.0021691583533538505, "clip_ratio/low_mean": 0.0053822115587536246, "clip_ratio/low_min": 0.0053822115587536246, "clip_ratio/region_mean": 0.00755136989755556, "completions/clipped_ratio": 0.0, "completions/max_length": 2930.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 2580.9375, "completions/mean_terminated_length": 2580.9375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.029016466811299324, "epoch": 4.1240001649600065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005243534687906504, "kl": 0.0167288325028494, "learning_rate": 7.462979994530847e-06, "loss": 0.0002, "num_tokens": 33448312.0, "reward": 5.02384090423584, "reward_std": 30.711990356445312, "rewards/rollout_reward_func/mean": 5.02384090423584, "rewards/rollout_reward_func/std": 30.711990356445312, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.8125, "sampling/sampling_logp_difference/mean": 0.23951344192028046, "step": 1031, "step_time": 37.07192819302145 }, { "clip_ratio/high_max": 0.002407003950793296, "clip_ratio/high_mean": 0.002407003950793296, "clip_ratio/low_mean": 0.004908450646325946, "clip_ratio/low_min": 0.004908450646325946, "clip_ratio/region_mean": 0.007315454655326903, "completions/clipped_ratio": 0.0, "completions/max_length": 2768.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 2504.8125, "completions/mean_terminated_length": 2504.8125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.03234475734643638, "epoch": 4.128000165120007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004361690953373909, "kl": 0.015638424665667117, "learning_rate": 7.4629799945198474e-06, "loss": 0.0002, "num_tokens": 33501290.0, "reward": 11.296991348266602, "reward_std": 41.824954986572266, "rewards/rollout_reward_func/mean": 11.296991348266602, "rewards/rollout_reward_func/std": 41.824954986572266, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.421875, "sampling/sampling_logp_difference/mean": 0.2562471926212311, "step": 1032, "step_time": 35.957955435005715 }, { "clip_ratio/high_max": 0.005743480927776545, "clip_ratio/high_mean": 0.005743480927776545, "clip_ratio/low_mean": 0.002893623517593369, "clip_ratio/low_min": 0.002893623517593369, "clip_ratio/region_mean": 0.008637104532681406, "completions/clipped_ratio": 0.0, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 2612.625, "completions/mean_terminated_length": 2612.625, "completions/min_length": 2542.0, "completions/min_terminated_length": 2542.0, "entropy": 0.0322243592236191, "epoch": 4.1320001652800066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.018551502376794815, "kl": 0.021975491428747773, "learning_rate": 7.462979994508838e-06, "loss": 0.0003, "num_tokens": 33555943.0, "reward": -7.113150596618652, "reward_std": 9.103316307067871, "rewards/rollout_reward_func/mean": -7.113150596618652, "rewards/rollout_reward_func/std": 9.103315353393555, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.15625, "sampling/sampling_logp_difference/mean": 0.25025680661201477, "step": 1033, "step_time": 36.37558404797164 }, { "clip_ratio/high_max": 0.004611624695826322, "clip_ratio/high_mean": 0.004611624695826322, "clip_ratio/low_mean": 0.0034657777287065983, "clip_ratio/low_min": 0.0034657777287065983, "clip_ratio/region_mean": 0.00807740242453292, "completions/clipped_ratio": 0.0, "completions/max_length": 2729.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 2454.3125, "completions/mean_terminated_length": 2454.3125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.033395782578736544, "epoch": 4.1360001654400064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010747799649834633, "kl": 0.01765990862622857, "learning_rate": 7.462979994497817e-06, "loss": 0.0002, "num_tokens": 33608054.0, "reward": -1.8911666870117188, "reward_std": 28.706335067749023, "rewards/rollout_reward_func/mean": -1.8911666870117188, "rewards/rollout_reward_func/std": 28.706335067749023, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.2880859375, "sampling/sampling_logp_difference/mean": 0.25236573815345764, "step": 1034, "step_time": 36.46771725600411 }, { "clip_ratio/high_max": 0.004459830379346386, "clip_ratio/high_mean": 0.004459830379346386, "clip_ratio/low_mean": 0.0032070834131445736, "clip_ratio/low_min": 0.0032070834131445736, "clip_ratio/region_mean": 0.007666913792490959, "completions/clipped_ratio": 0.0, "completions/max_length": 2800.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 2661.625, "completions/mean_terminated_length": 2661.625, "completions/min_length": 2563.0, "completions/min_terminated_length": 2563.0, "entropy": 0.030336230527609587, "epoch": 4.140000165600007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008558889850974083, "kl": 0.017864433233626187, "learning_rate": 7.462979994486785e-06, "loss": 0.0002, "num_tokens": 33663487.0, "reward": -7.62301778793335, "reward_std": 11.155172348022461, "rewards/rollout_reward_func/mean": -7.62301778793335, "rewards/rollout_reward_func/std": 11.155172348022461, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.875, "sampling/sampling_logp_difference/mean": 0.2539484202861786, "step": 1035, "step_time": 37.091130638014874 }, { "clip_ratio/high_max": 0.0034367743937764317, "clip_ratio/high_mean": 0.0034367743937764317, "clip_ratio/low_mean": 0.004434635629877448, "clip_ratio/low_min": 0.004434635629877448, "clip_ratio/region_mean": 0.00787140999455005, "completions/clipped_ratio": 0.0, "completions/max_length": 2876.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 2678.0625, "completions/mean_terminated_length": 2678.0625, "completions/min_length": 2566.0, "completions/min_terminated_length": 2566.0, "entropy": 0.029393795877695084, "epoch": 4.1440001657600065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006672416813671589, "kl": 0.0164415166946128, "learning_rate": 7.462979994475742e-06, "loss": 0.0002, "num_tokens": 33719220.0, "reward": -6.023880958557129, "reward_std": 7.282141208648682, "rewards/rollout_reward_func/mean": -6.023880958557129, "rewards/rollout_reward_func/std": 7.282141208648682, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.5, "sampling/sampling_logp_difference/mean": 0.24431250989437103, "step": 1036, "step_time": 38.10494598299556 }, { "clip_ratio/high_max": 0.003262154059484601, "clip_ratio/high_mean": 0.003262154059484601, "clip_ratio/low_mean": 0.004857242034631781, "clip_ratio/low_min": 0.004857242034631781, "clip_ratio/region_mean": 0.008119396050460637, "completions/clipped_ratio": 0.0, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 2570.25, "completions/mean_terminated_length": 2570.25, "completions/min_length": 1988.0, "completions/min_terminated_length": 1988.0, "entropy": 0.03036074200645089, "epoch": 4.148000165920007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007443538401275873, "kl": 0.015629837289452553, "learning_rate": 7.462979994464689e-06, "loss": 0.0002, "num_tokens": 33773218.0, "reward": -0.08507966995239258, "reward_std": 30.215818405151367, "rewards/rollout_reward_func/mean": -0.08507966995239258, "rewards/rollout_reward_func/std": 30.215822219848633, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.03125, "sampling/sampling_logp_difference/mean": 0.2618919312953949, "step": 1037, "step_time": 36.866705654989346 }, { "clip_ratio/high_max": 0.004186852107523009, "clip_ratio/high_mean": 0.004186852107523009, "clip_ratio/low_mean": 0.0038835020968690515, "clip_ratio/low_min": 0.0038835020968690515, "clip_ratio/region_mean": 0.00807035417528823, "completions/clipped_ratio": 0.0, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 2733.75, "completions/mean_terminated_length": 2733.75, "completions/min_length": 2547.0, "completions/min_terminated_length": 2547.0, "entropy": 0.03063804330304265, "epoch": 4.152000166080007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006647584028542042, "kl": 0.015468631754629314, "learning_rate": 7.462979994453623e-06, "loss": 0.0002, "num_tokens": 33829878.0, "reward": -9.678243637084961, "reward_std": 11.290266036987305, "rewards/rollout_reward_func/mean": -9.678243637084961, "rewards/rollout_reward_func/std": 11.290266036987305, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.59375, "sampling/sampling_logp_difference/mean": 0.24355193972587585, "step": 1038, "step_time": 37.69028476801759 }, { "clip_ratio/high_max": 0.004707811342086643, "clip_ratio/high_mean": 0.004707811342086643, "clip_ratio/low_mean": 0.003753919358132407, "clip_ratio/low_min": 0.003753919358132407, "clip_ratio/region_mean": 0.00846173067111522, "completions/clipped_ratio": 0.0, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 2638.1875, "completions/mean_terminated_length": 2638.1875, "completions/min_length": 1499.0, "completions/min_terminated_length": 1499.0, "entropy": 0.033119660802185535, "epoch": 4.1560001662400064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01022002100944519, "kl": 0.01718163024634123, "learning_rate": 7.462979994442546e-06, "loss": 0.0002, "num_tokens": 33884985.0, "reward": 0.02732086181640625, "reward_std": 34.56388473510742, "rewards/rollout_reward_func/mean": 0.02732086181640625, "rewards/rollout_reward_func/std": 34.56388854980469, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.34375, "sampling/sampling_logp_difference/mean": 0.25038468837738037, "step": 1039, "step_time": 37.52987858999404 }, { "clip_ratio/high_max": 0.004415668401634321, "clip_ratio/high_mean": 0.004415668401634321, "clip_ratio/low_mean": 0.003125298724626191, "clip_ratio/low_min": 0.003125298724626191, "clip_ratio/region_mean": 0.007540967199020088, "completions/clipped_ratio": 0.0, "completions/max_length": 2895.0, "completions/max_terminated_length": 2895.0, "completions/mean_length": 2738.6875, "completions/mean_terminated_length": 2738.6875, "completions/min_length": 2583.0, "completions/min_terminated_length": 2583.0, "entropy": 0.02906183572486043, "epoch": 4.160000166400007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006253200117498636, "kl": 0.016340541187673807, "learning_rate": 7.4629799944314595e-06, "loss": 0.0002, "num_tokens": 33941722.0, "reward": -12.490503311157227, "reward_std": 9.086947441101074, "rewards/rollout_reward_func/mean": -12.490503311157227, "rewards/rollout_reward_func/std": 9.086947441101074, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.625, "sampling/sampling_logp_difference/mean": 0.24441412091255188, "step": 1040, "step_time": 37.60437396299676 }, { "clip_ratio/high_max": 0.00461086438735947, "clip_ratio/high_mean": 0.00461086438735947, "clip_ratio/low_mean": 0.003681688045617193, "clip_ratio/low_min": 0.003681688045617193, "clip_ratio/region_mean": 0.008292552491184324, "completions/clipped_ratio": 0.0, "completions/max_length": 3108.0, "completions/max_terminated_length": 3108.0, "completions/mean_length": 2892.4375, "completions/mean_terminated_length": 2892.4375, "completions/min_length": 2759.0, "completions/min_terminated_length": 2759.0, "entropy": 0.03021391504444182, "epoch": 4.1640001665600066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008693547919392586, "kl": 0.015644886414520442, "learning_rate": 7.462979994420361e-06, "loss": 0.0002, "num_tokens": 34000878.0, "reward": -1.5512775182724, "reward_std": 10.510721206665039, "rewards/rollout_reward_func/mean": -1.5512775182724, "rewards/rollout_reward_func/std": 10.510721206665039, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.09375, "sampling/sampling_logp_difference/mean": 0.24392473697662354, "step": 1041, "step_time": 40.49886970498483 }, { "clip_ratio/high_max": 0.004003378300694749, "clip_ratio/high_mean": 0.004003378300694749, "clip_ratio/low_mean": 0.004050828982144594, "clip_ratio/low_min": 0.004050828982144594, "clip_ratio/region_mean": 0.008054207311943173, "completions/clipped_ratio": 0.0, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 2835.1875, "completions/mean_terminated_length": 2835.1875, "completions/min_length": 2592.0, "completions/min_terminated_length": 2592.0, "entropy": 0.03047833312302828, "epoch": 4.168000166720006e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008698491379618645, "kl": 0.01592630730010569, "learning_rate": 7.462979994409252e-06, "loss": 0.0002, "num_tokens": 34059118.0, "reward": 6.635374069213867, "reward_std": 26.529159545898438, "rewards/rollout_reward_func/mean": 6.635374069213867, "rewards/rollout_reward_func/std": 26.529159545898438, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.03125, "sampling/sampling_logp_difference/mean": 0.24986913800239563, "step": 1042, "step_time": 39.548221658027614 }, { "clip_ratio/high_max": 0.003891745815053582, "clip_ratio/high_mean": 0.003891745815053582, "clip_ratio/low_mean": 0.004566755553241819, "clip_ratio/low_min": 0.004566755553241819, "clip_ratio/region_mean": 0.008458501426503062, "completions/clipped_ratio": 0.0, "completions/max_length": 3053.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 2799.875, "completions/mean_terminated_length": 2799.875, "completions/min_length": 2273.0, "completions/min_terminated_length": 2273.0, "entropy": 0.03151545859873295, "epoch": 4.172000166880007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.09741608053445816, "kl": 0.031760359299369156, "learning_rate": 7.462979994398132e-06, "loss": 0.0004, "num_tokens": 34116794.0, "reward": 14.388385772705078, "reward_std": 50.87844467163086, "rewards/rollout_reward_func/mean": 14.388385772705078, "rewards/rollout_reward_func/std": 50.87844467163086, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.140625, "sampling/sampling_logp_difference/mean": 0.2627851366996765, "step": 1043, "step_time": 39.46246358602366 }, { "clip_ratio/high_max": 0.00400060327956453, "clip_ratio/high_mean": 0.00400060327956453, "clip_ratio/low_mean": 0.004295664868550375, "clip_ratio/low_min": 0.004295664868550375, "clip_ratio/region_mean": 0.008296268177218735, "completions/clipped_ratio": 0.0, "completions/max_length": 3114.0, "completions/max_terminated_length": 3114.0, "completions/mean_length": 2955.75, "completions/mean_terminated_length": 2955.75, "completions/min_length": 2703.0, "completions/min_terminated_length": 2703.0, "entropy": 0.02901784796267748, "epoch": 4.1760001670400065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012523858807981014, "kl": 0.01585029752459377, "learning_rate": 7.4629799943870005e-06, "loss": 0.0002, "num_tokens": 34177007.0, "reward": -1.573325276374817, "reward_std": 11.007741928100586, "rewards/rollout_reward_func/mean": -1.573325276374817, "rewards/rollout_reward_func/std": 11.007742881774902, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.781253814697266, "sampling/sampling_logp_difference/mean": 0.2445148080587387, "step": 1044, "step_time": 40.8169547389989 }, { "clip_ratio/high_max": 0.0030730817816220224, "clip_ratio/high_mean": 0.0030730817816220224, "clip_ratio/low_mean": 0.004329693736508489, "clip_ratio/low_min": 0.004329693736508489, "clip_ratio/region_mean": 0.00740277545992285, "completions/clipped_ratio": 0.0, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 2879.5, "completions/mean_terminated_length": 2879.5, "completions/min_length": 2453.0, "completions/min_terminated_length": 2453.0, "entropy": 0.02862015343271196, "epoch": 4.180000167200007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006035584025084972, "kl": 0.013317188946530223, "learning_rate": 7.462979994375858e-06, "loss": 0.0002, "num_tokens": 34235976.0, "reward": 5.687074661254883, "reward_std": 27.61423110961914, "rewards/rollout_reward_func/mean": 5.687074661254883, "rewards/rollout_reward_func/std": 27.614233016967773, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.83594512939453, "sampling/sampling_logp_difference/mean": 0.24683023989200592, "step": 1045, "step_time": 39.489238655005465 }, { "clip_ratio/high_max": 0.004713613132480532, "clip_ratio/high_mean": 0.004713613132480532, "clip_ratio/low_mean": 0.003210302529623732, "clip_ratio/low_min": 0.003210302529623732, "clip_ratio/region_mean": 0.007923915633000433, "completions/clipped_ratio": 0.0, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 2918.25, "completions/mean_terminated_length": 2918.25, "completions/min_length": 2790.0, "completions/min_terminated_length": 2790.0, "entropy": 0.029570756945759058, "epoch": 4.1840001673600067e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004650096409022808, "kl": 0.0150362781714648, "learning_rate": 7.462979994364706e-06, "loss": 0.0002, "num_tokens": 34295559.0, "reward": -0.7617596387863159, "reward_std": 10.172390937805176, "rewards/rollout_reward_func/mean": -0.7617596387863159, "rewards/rollout_reward_func/std": 10.172390937805176, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.94923782348633, "sampling/sampling_logp_difference/mean": 0.2448168843984604, "step": 1046, "step_time": 39.547561972998665 }, { "clip_ratio/high_max": 0.0021952941024210304, "clip_ratio/high_mean": 0.0021952941024210304, "clip_ratio/low_mean": 0.00478038951405324, "clip_ratio/low_min": 0.00478038951405324, "clip_ratio/region_mean": 0.006975683674681932, "completions/clipped_ratio": 0.0, "completions/max_length": 3148.0, "completions/max_terminated_length": 3148.0, "completions/mean_length": 2929.75, "completions/mean_terminated_length": 2929.75, "completions/min_length": 2493.0, "completions/min_terminated_length": 2493.0, "entropy": 0.027792751556262374, "epoch": 4.1880001675200064e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.03930896893143654, "kl": 0.016921638045459986, "learning_rate": 7.46297999435354e-06, "loss": 0.0002, "num_tokens": 34355351.0, "reward": 4.1331586837768555, "reward_std": 18.432771682739258, "rewards/rollout_reward_func/mean": 4.1331586837768555, "rewards/rollout_reward_func/std": 18.432771682739258, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.078125, "sampling/sampling_logp_difference/mean": 0.24035616219043732, "step": 1047, "step_time": 40.0579707090219 }, { "clip_ratio/high_max": 0.00367924285819754, "clip_ratio/high_mean": 0.00367924285819754, "clip_ratio/low_mean": 0.003894012770615518, "clip_ratio/low_min": 0.003894012770615518, "clip_ratio/region_mean": 0.007573255570605397, "completions/clipped_ratio": 0.0, "completions/max_length": 3140.0, "completions/max_terminated_length": 3140.0, "completions/mean_length": 2966.25, "completions/mean_terminated_length": 2966.25, "completions/min_length": 2798.0, "completions/min_terminated_length": 2798.0, "entropy": 0.02852563955821097, "epoch": 4.192000167680007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.06341102719306946, "kl": 0.017372631118632853, "learning_rate": 7.462979994342364e-06, "loss": 0.0003, "num_tokens": 34415741.0, "reward": -7.794526100158691, "reward_std": 9.701085090637207, "rewards/rollout_reward_func/mean": -7.794526100158691, "rewards/rollout_reward_func/std": 9.701086044311523, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.765625, "sampling/sampling_logp_difference/mean": 0.23545488715171814, "step": 1048, "step_time": 40.22327767100069 }, { "clip_ratio/high_max": 0.003195069875800982, "clip_ratio/high_mean": 0.003195069875800982, "clip_ratio/low_mean": 0.004544547409750521, "clip_ratio/low_min": 0.004544547409750521, "clip_ratio/region_mean": 0.007739617198240012, "completions/clipped_ratio": 0.0, "completions/max_length": 3151.0, "completions/max_terminated_length": 3151.0, "completions/mean_length": 2747.0, "completions/mean_terminated_length": 2747.0, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.030137971509248018, "epoch": 4.1960001678400066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003067341400310397, "kl": 0.012835874105803668, "learning_rate": 7.4629799943311785e-06, "loss": 0.0002, "num_tokens": 34472589.0, "reward": -2.906832695007324, "reward_std": 20.384057998657227, "rewards/rollout_reward_func/mean": -2.906832695007324, "rewards/rollout_reward_func/std": 20.384056091308594, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.003910064697266, "sampling/sampling_logp_difference/mean": 0.24289129674434662, "step": 1049, "step_time": 39.60443297999154 }, { "clip_ratio/high_max": 0.004318600578699261, "clip_ratio/high_mean": 0.004318600578699261, "clip_ratio/low_mean": 0.003568072512280196, "clip_ratio/low_min": 0.003568072512280196, "clip_ratio/region_mean": 0.007886673149187118, "completions/clipped_ratio": 0.0, "completions/max_length": 3110.0, "completions/max_terminated_length": 3110.0, "completions/mean_length": 3001.875, "completions/mean_terminated_length": 3001.875, "completions/min_length": 2842.0, "completions/min_terminated_length": 2842.0, "entropy": 0.02755984547547996, "epoch": 4.200000168000007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0060476502403616905, "kl": 0.013351162662729621, "learning_rate": 7.46297999431998e-06, "loss": 0.0002, "num_tokens": 34533551.0, "reward": -5.431593418121338, "reward_std": 8.27995777130127, "rewards/rollout_reward_func/mean": -5.431593418121338, "rewards/rollout_reward_func/std": 8.27995777130127, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.765625, "sampling/sampling_logp_difference/mean": 0.23497788608074188, "step": 1050, "step_time": 40.866318995002075 }, { "clip_ratio/high_max": 0.0029899508808739483, "clip_ratio/high_mean": 0.0029899508808739483, "clip_ratio/low_mean": 0.0046414439275395125, "clip_ratio/low_min": 0.0046414439275395125, "clip_ratio/region_mean": 0.00763139477930963, "completions/clipped_ratio": 0.0, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 2876.125, "completions/mean_terminated_length": 2876.125, "completions/min_length": 2007.0, "completions/min_terminated_length": 2007.0, "entropy": 0.030319620156660676, "epoch": 4.204000168160007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.012799231335520744, "kl": 0.015018873033113778, "learning_rate": 7.462979994308771e-06, "loss": 0.0002, "num_tokens": 34592461.0, "reward": 5.824012279510498, "reward_std": 32.49828338623047, "rewards/rollout_reward_func/mean": 5.824012279510498, "rewards/rollout_reward_func/std": 32.498287200927734, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 39.125, "sampling/sampling_logp_difference/mean": 0.2525031566619873, "step": 1051, "step_time": 39.69840143999318 }, { "clip_ratio/high_max": 0.00440683591295965, "clip_ratio/high_mean": 0.00440683591295965, "clip_ratio/low_mean": 0.003681459667859599, "clip_ratio/low_min": 0.003681459667859599, "clip_ratio/region_mean": 0.00808829563902691, "completions/clipped_ratio": 0.0, "completions/max_length": 3125.0, "completions/max_terminated_length": 3125.0, "completions/mean_length": 2775.8125, "completions/mean_terminated_length": 2775.8125, "completions/min_length": 1538.0, "completions/min_terminated_length": 1538.0, "entropy": 0.03032686631195247, "epoch": 4.2080001683200065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00377421616576612, "kl": 0.013748234952799976, "learning_rate": 7.462979994297552e-06, "loss": 0.0002, "num_tokens": 34649772.0, "reward": 4.64014196395874, "reward_std": 41.46749496459961, "rewards/rollout_reward_func/mean": 4.64014196395874, "rewards/rollout_reward_func/std": 41.46749496459961, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.3125, "sampling/sampling_logp_difference/mean": 0.25333666801452637, "step": 1052, "step_time": 40.17485860802117 }, { "clip_ratio/high_max": 0.0033064952585846186, "clip_ratio/high_mean": 0.0033064952585846186, "clip_ratio/low_mean": 0.004638490121578798, "clip_ratio/low_min": 0.004638490121578798, "clip_ratio/region_mean": 0.007944985351059586, "completions/clipped_ratio": 0.0, "completions/max_length": 3106.0, "completions/max_terminated_length": 3106.0, "completions/mean_length": 2873.75, "completions/mean_terminated_length": 2873.75, "completions/min_length": 2058.0, "completions/min_terminated_length": 2058.0, "entropy": 0.030495173996314406, "epoch": 4.212000168480007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003724457696080208, "kl": 0.012197356321848929, "learning_rate": 7.4629799942863204e-06, "loss": 0.0002, "num_tokens": 34708653.0, "reward": -0.12473821640014648, "reward_std": 34.348751068115234, "rewards/rollout_reward_func/mean": -0.12473821640014648, "rewards/rollout_reward_func/std": 34.3487548828125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.75, "sampling/sampling_logp_difference/mean": 0.2510945796966553, "step": 1053, "step_time": 40.61551339899597 }, { "clip_ratio/high_max": 0.0029971209296490997, "clip_ratio/high_mean": 0.0029971209296490997, "clip_ratio/low_mean": 0.005186807480640709, "clip_ratio/low_min": 0.005186807480640709, "clip_ratio/region_mean": 0.008183928381185979, "completions/clipped_ratio": 0.0, "completions/max_length": 3094.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 2787.75, "completions/mean_terminated_length": 2787.75, "completions/min_length": 1574.0, "completions/min_terminated_length": 1574.0, "entropy": 0.03163273353129625, "epoch": 4.2160001686400066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035336054861545563, "kl": 0.013934476883150637, "learning_rate": 7.462979994275079e-06, "loss": 0.0002, "num_tokens": 34766155.0, "reward": 7.309213638305664, "reward_std": 40.777976989746094, "rewards/rollout_reward_func/mean": 7.309213638305664, "rewards/rollout_reward_func/std": 40.77796936035156, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.046875, "sampling/sampling_logp_difference/mean": 0.25256267189979553, "step": 1054, "step_time": 40.103178304983885 }, { "clip_ratio/high_max": 0.00432459989679046, "clip_ratio/high_mean": 0.00432459989679046, "clip_ratio/low_mean": 0.0039529200294055045, "clip_ratio/low_min": 0.0039529200294055045, "clip_ratio/region_mean": 0.008277519838884473, "completions/clipped_ratio": 0.0, "completions/max_length": 2913.0, "completions/max_terminated_length": 2913.0, "completions/mean_length": 2810.0625, "completions/mean_terminated_length": 2810.0625, "completions/min_length": 2713.0, "completions/min_terminated_length": 2713.0, "entropy": 0.03088148357346654, "epoch": 4.220000168800007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007123843766748905, "kl": 0.014134539815131575, "learning_rate": 7.462979994263827e-06, "loss": 0.0002, "num_tokens": 34823963.0, "reward": -6.397205829620361, "reward_std": 5.911303997039795, "rewards/rollout_reward_func/mean": -6.397205829620361, "rewards/rollout_reward_func/std": 5.911303997039795, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.122379302978516, "sampling/sampling_logp_difference/mean": 0.24617142975330353, "step": 1055, "step_time": 39.081910988970776 }, { "clip_ratio/high_max": 0.003567206527804956, "clip_ratio/high_mean": 0.003567206527804956, "clip_ratio/low_mean": 0.003828933462500572, "clip_ratio/low_min": 0.003828933462500572, "clip_ratio/region_mean": 0.007396139961201698, "completions/clipped_ratio": 0.0, "completions/max_length": 3102.0, "completions/max_terminated_length": 3102.0, "completions/mean_length": 2888.25, "completions/mean_terminated_length": 2888.25, "completions/min_length": 2741.0, "completions/min_terminated_length": 2741.0, "entropy": 0.02975510898977518, "epoch": 4.224000168960007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006450233981013298, "kl": 0.014751105569303036, "learning_rate": 7.462979994252562e-06, "loss": 0.0002, "num_tokens": 34883066.0, "reward": 5.046970367431641, "reward_std": 14.738107681274414, "rewards/rollout_reward_func/mean": 5.046970367431641, "rewards/rollout_reward_func/std": 14.73810863494873, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.375, "sampling/sampling_logp_difference/mean": 0.24810364842414856, "step": 1056, "step_time": 40.63845861797745 }, { "clip_ratio/high_max": 0.0041219694539904594, "clip_ratio/high_mean": 0.0041219694539904594, "clip_ratio/low_mean": 0.003907512640580535, "clip_ratio/low_min": 0.003907512640580535, "clip_ratio/region_mean": 0.008029482094570994, "completions/clipped_ratio": 0.0, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 2863.25, "completions/mean_terminated_length": 2863.25, "completions/min_length": 2760.0, "completions/min_terminated_length": 2760.0, "entropy": 0.029443777399137616, "epoch": 4.2280001691200065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00904869381338358, "kl": 0.014922004542313516, "learning_rate": 7.462979994241287e-06, "loss": 0.0002, "num_tokens": 34941753.0, "reward": -8.579017639160156, "reward_std": 15.026400566101074, "rewards/rollout_reward_func/mean": -8.579017639160156, "rewards/rollout_reward_func/std": 15.02640151977539, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.9140625, "sampling/sampling_logp_difference/mean": 0.25376781821250916, "step": 1057, "step_time": 39.780482009009575 }, { "clip_ratio/high_max": 0.004958972189342603, "clip_ratio/high_mean": 0.004958972189342603, "clip_ratio/low_mean": 0.003285269485786557, "clip_ratio/low_min": 0.003285269485786557, "clip_ratio/region_mean": 0.00824424170423299, "completions/clipped_ratio": 0.0, "completions/max_length": 3090.0, "completions/max_terminated_length": 3090.0, "completions/mean_length": 2925.3125, "completions/mean_terminated_length": 2925.3125, "completions/min_length": 2770.0, "completions/min_terminated_length": 2770.0, "entropy": 0.028149138437584043, "epoch": 4.232000169280007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.18276241421699524, "kl": 0.037526876432821155, "learning_rate": 7.462979994230002e-06, "loss": 0.0005, "num_tokens": 35001459.0, "reward": -1.2699416875839233, "reward_std": 13.807193756103516, "rewards/rollout_reward_func/mean": -1.2699416875839233, "rewards/rollout_reward_func/std": 13.807193756103516, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.125, "sampling/sampling_logp_difference/mean": 0.24797405302524567, "step": 1058, "step_time": 40.566515501006506 }, { "clip_ratio/high_max": 0.00354744924698025, "clip_ratio/high_mean": 0.00354744924698025, "clip_ratio/low_mean": 0.003714265680173412, "clip_ratio/low_min": 0.003714265680173412, "clip_ratio/region_mean": 0.007261714956257492, "completions/clipped_ratio": 0.0, "completions/max_length": 3150.0, "completions/max_terminated_length": 3150.0, "completions/mean_length": 2827.625, "completions/mean_terminated_length": 2827.625, "completions/min_length": 1240.0, "completions/min_terminated_length": 1240.0, "entropy": 0.028942867880687118, "epoch": 4.236000169440007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007105860859155655, "kl": 0.012523058918304741, "learning_rate": 7.4629799942187035e-06, "loss": 0.0002, "num_tokens": 35059600.0, "reward": 9.750701904296875, "reward_std": 32.419918060302734, "rewards/rollout_reward_func/mean": 9.750701904296875, "rewards/rollout_reward_func/std": 32.419921875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.45703125, "sampling/sampling_logp_difference/mean": 0.254063218832016, "step": 1059, "step_time": 39.66865835497447 }, { "clip_ratio/high_max": 0.003593198722228408, "clip_ratio/high_mean": 0.003593198722228408, "clip_ratio/low_mean": 0.003370353631908074, "clip_ratio/low_min": 0.003370353631908074, "clip_ratio/region_mean": 0.006963552499655634, "completions/clipped_ratio": 0.0, "completions/max_length": 3076.0, "completions/max_terminated_length": 3076.0, "completions/mean_length": 2962.125, "completions/mean_terminated_length": 2962.125, "completions/min_length": 2760.0, "completions/min_terminated_length": 2760.0, "entropy": 0.027981041464954615, "epoch": 4.240000169600007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006235647015273571, "kl": 0.013978470582515001, "learning_rate": 7.462979994207396e-06, "loss": 0.0002, "num_tokens": 35119919.0, "reward": -3.719797372817993, "reward_std": 11.483839988708496, "rewards/rollout_reward_func/mean": -3.719797372817993, "rewards/rollout_reward_func/std": 11.483839988708496, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.234375, "sampling/sampling_logp_difference/mean": 0.23664136230945587, "step": 1060, "step_time": 40.58501494397933 }, { "clip_ratio/high_max": 0.005214728182181716, "clip_ratio/high_mean": 0.005214728182181716, "clip_ratio/low_mean": 0.002847732452210039, "clip_ratio/low_min": 0.002847732452210039, "clip_ratio/region_mean": 0.008062460517976433, "completions/clipped_ratio": 0.0, "completions/max_length": 3118.0, "completions/max_terminated_length": 3118.0, "completions/mean_length": 2945.9375, "completions/mean_terminated_length": 2945.9375, "completions/min_length": 2761.0, "completions/min_terminated_length": 2761.0, "entropy": 0.028458287240937352, "epoch": 4.244000169760007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011640074662864208, "kl": 0.01431273715570569, "learning_rate": 7.4629799941960776e-06, "loss": 0.0002, "num_tokens": 35179977.0, "reward": -4.197793960571289, "reward_std": 6.5335822105407715, "rewards/rollout_reward_func/mean": -4.197793960571289, "rewards/rollout_reward_func/std": 6.5335822105407715, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.015625, "sampling/sampling_logp_difference/mean": 0.24823063611984253, "step": 1061, "step_time": 40.59102374500071 }, { "clip_ratio/high_max": 0.004397650569444522, "clip_ratio/high_mean": 0.004397650569444522, "clip_ratio/low_mean": 0.0019668000313686207, "clip_ratio/low_min": 0.0019668000313686207, "clip_ratio/region_mean": 0.006364450557157397, "completions/clipped_ratio": 0.0, "completions/max_length": 3107.0, "completions/max_terminated_length": 3107.0, "completions/mean_length": 2977.5625, "completions/mean_terminated_length": 2977.5625, "completions/min_length": 2855.0, "completions/min_terminated_length": 2855.0, "entropy": 0.028759153792634606, "epoch": 4.2480001699200066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004532418213784695, "kl": 0.011776114464737475, "learning_rate": 7.462979994184746e-06, "loss": 0.0002, "num_tokens": 35240567.0, "reward": -1.7706992626190186, "reward_std": 10.615839004516602, "rewards/rollout_reward_func/mean": -1.7706992626190186, "rewards/rollout_reward_func/std": 10.615839004516602, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.03125, "sampling/sampling_logp_difference/mean": 0.23498839139938354, "step": 1062, "step_time": 40.9370981920074 }, { "clip_ratio/high_max": 0.0021871375502087176, "clip_ratio/high_mean": 0.0021871375502087176, "clip_ratio/low_mean": 0.005048049468314275, "clip_ratio/low_min": 0.005048049468314275, "clip_ratio/region_mean": 0.007235187105834484, "completions/clipped_ratio": 0.0, "completions/max_length": 3114.0, "completions/max_terminated_length": 3114.0, "completions/mean_length": 2595.3125, "completions/mean_terminated_length": 2595.3125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.03256926336325705, "epoch": 4.252000170080007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006944979541003704, "kl": 0.014284203993156552, "learning_rate": 7.462979994173405e-06, "loss": 0.0002, "num_tokens": 35294993.0, "reward": 18.308292388916016, "reward_std": 43.70484161376953, "rewards/rollout_reward_func/mean": 18.308292388916016, "rewards/rollout_reward_func/std": 43.70484161376953, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.35573196411133, "sampling/sampling_logp_difference/mean": 0.2661041021347046, "step": 1063, "step_time": 39.63210074001108 }, { "clip_ratio/high_max": 0.003997801657533273, "clip_ratio/high_mean": 0.003997801657533273, "clip_ratio/low_mean": 0.003913414373528212, "clip_ratio/low_min": 0.003913414373528212, "clip_ratio/region_mean": 0.007911216001957655, "completions/clipped_ratio": 0.0, "completions/max_length": 3120.0, "completions/max_terminated_length": 3120.0, "completions/mean_length": 2907.9375, "completions/mean_terminated_length": 2907.9375, "completions/min_length": 2796.0, "completions/min_terminated_length": 2796.0, "entropy": 0.029178711585700512, "epoch": 4.256000170240007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004712756723165512, "kl": 0.013307249289937317, "learning_rate": 7.462979994162053e-06, "loss": 0.0002, "num_tokens": 35354436.0, "reward": -5.9773945808410645, "reward_std": 8.257695198059082, "rewards/rollout_reward_func/mean": -5.9773945808410645, "rewards/rollout_reward_func/std": 8.257695198059082, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.240234375, "sampling/sampling_logp_difference/mean": 0.24643169343471527, "step": 1064, "step_time": 40.80883769901993 }, { "clip_ratio/high_max": 0.0019587986753322184, "clip_ratio/high_mean": 0.0019587986753322184, "clip_ratio/low_mean": 0.005420589412096888, "clip_ratio/low_min": 0.005420589412096888, "clip_ratio/region_mean": 0.007379388029221445, "completions/clipped_ratio": 0.0, "completions/max_length": 3145.0, "completions/max_terminated_length": 3145.0, "completions/mean_length": 2932.6875, "completions/mean_terminated_length": 2932.6875, "completions/min_length": 1949.0, "completions/min_terminated_length": 1949.0, "entropy": 0.03006536583416164, "epoch": 4.2600001704000065e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004933414980769157, "kl": 0.013052922207862139, "learning_rate": 7.462979994150689e-06, "loss": 0.0002, "num_tokens": 35414296.0, "reward": 8.123594284057617, "reward_std": 44.43642807006836, "rewards/rollout_reward_func/mean": 8.123594284057617, "rewards/rollout_reward_func/std": 44.436424255371094, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 60.146484375, "sampling/sampling_logp_difference/mean": 0.25027409195899963, "step": 1065, "step_time": 40.14119354500144 }, { "clip_ratio/high_max": 0.0036116200499236584, "clip_ratio/high_mean": 0.0036116200499236584, "clip_ratio/low_mean": 0.004075466946233064, "clip_ratio/low_min": 0.004075466946233064, "clip_ratio/region_mean": 0.007687087054364383, "completions/clipped_ratio": 0.0, "completions/max_length": 3107.0, "completions/max_terminated_length": 3107.0, "completions/mean_length": 2907.0625, "completions/mean_terminated_length": 2907.0625, "completions/min_length": 2737.0, "completions/min_terminated_length": 2737.0, "entropy": 0.029263437492772937, "epoch": 4.264000170560007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01575707457959652, "kl": 0.015272566699422896, "learning_rate": 7.4629799941393145e-06, "loss": 0.0002, "num_tokens": 35473698.0, "reward": -0.14712166786193848, "reward_std": 13.891862869262695, "rewards/rollout_reward_func/mean": -0.14712166786193848, "rewards/rollout_reward_func/std": 13.891862869262695, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.96875, "sampling/sampling_logp_difference/mean": 0.2500799298286438, "step": 1066, "step_time": 40.84190516198578 }, { "clip_ratio/high_max": 0.005535860516829416, "clip_ratio/high_mean": 0.005535860516829416, "clip_ratio/low_mean": 0.002823823655489832, "clip_ratio/low_min": 0.002823823655489832, "clip_ratio/region_mean": 0.008359684143215418, "completions/clipped_ratio": 0.0, "completions/max_length": 2962.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 2700.5, "completions/mean_terminated_length": 2700.5, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.02998408768326044, "epoch": 4.268000170720007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.030430732294917107, "kl": 0.017518588108941913, "learning_rate": 7.4629799941279295e-06, "loss": 0.0002, "num_tokens": 35529793.0, "reward": -3.1370396614074707, "reward_std": 25.207773208618164, "rewards/rollout_reward_func/mean": -3.1370396614074707, "rewards/rollout_reward_func/std": 25.207775115966797, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.625, "sampling/sampling_logp_difference/mean": 0.24647879600524902, "step": 1067, "step_time": 39.037057237001136 }, { "clip_ratio/high_max": 0.003176691447151825, "clip_ratio/high_mean": 0.003176691447151825, "clip_ratio/low_mean": 0.004766974569065496, "clip_ratio/low_min": 0.004766974569065496, "clip_ratio/region_mean": 0.00794366595800966, "completions/clipped_ratio": 0.0, "completions/max_length": 3119.0, "completions/max_terminated_length": 3119.0, "completions/mean_length": 2940.1875, "completions/mean_terminated_length": 2940.1875, "completions/min_length": 2772.0, "completions/min_terminated_length": 2772.0, "entropy": 0.028977265814319253, "epoch": 4.272000170880007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004529498517513275, "kl": 0.011679703136906028, "learning_rate": 7.462979994116533e-06, "loss": 0.0002, "num_tokens": 35589743.0, "reward": -4.122562885284424, "reward_std": 9.945459365844727, "rewards/rollout_reward_func/mean": -4.122562885284424, "rewards/rollout_reward_func/std": 9.945459365844727, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.0, "sampling/sampling_logp_difference/mean": 0.24792960286140442, "step": 1068, "step_time": 40.84213707500021 }, { "clip_ratio/high_max": 0.004083456675289199, "clip_ratio/high_mean": 0.004083456675289199, "clip_ratio/low_mean": 0.0035594037908595055, "clip_ratio/low_min": 0.0035594037908595055, "clip_ratio/region_mean": 0.007642860466148704, "completions/clipped_ratio": 0.0, "completions/max_length": 3131.0, "completions/max_terminated_length": 3131.0, "completions/mean_length": 2872.5, "completions/mean_terminated_length": 2872.5, "completions/min_length": 2731.0, "completions/min_terminated_length": 2731.0, "entropy": 0.02989386348053813, "epoch": 4.276000171040007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004688775632530451, "kl": 0.011737294145859778, "learning_rate": 7.462979994105125e-06, "loss": 0.0002, "num_tokens": 35648582.0, "reward": -3.236926317214966, "reward_std": 10.440980911254883, "rewards/rollout_reward_func/mean": -3.236926317214966, "rewards/rollout_reward_func/std": 10.4409818649292, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.28125, "sampling/sampling_logp_difference/mean": 0.24859456717967987, "step": 1069, "step_time": 40.824743887002114 }, { "clip_ratio/high_max": 0.004831591388210654, "clip_ratio/high_mean": 0.004831591388210654, "clip_ratio/low_mean": 0.003221314458642155, "clip_ratio/low_min": 0.003221314458642155, "clip_ratio/region_mean": 0.008052906021475792, "completions/clipped_ratio": 0.0, "completions/max_length": 3079.0, "completions/max_terminated_length": 3079.0, "completions/mean_length": 2920.0, "completions/mean_terminated_length": 2920.0, "completions/min_length": 2752.0, "completions/min_terminated_length": 2752.0, "entropy": 0.030048172222450376, "epoch": 4.2800001712000066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003922916017472744, "kl": 0.01492852729279548, "learning_rate": 7.462979994093707e-06, "loss": 0.0002, "num_tokens": 35708199.0, "reward": -3.1000704765319824, "reward_std": 12.726787567138672, "rewards/rollout_reward_func/mean": -3.1000704765319824, "rewards/rollout_reward_func/std": 12.726787567138672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.0, "sampling/sampling_logp_difference/mean": 0.25027981400489807, "step": 1070, "step_time": 40.25368552899454 }, { "clip_ratio/high_max": 0.004237086541252211, "clip_ratio/high_mean": 0.004237086541252211, "clip_ratio/low_mean": 0.0033805931016104296, "clip_ratio/low_min": 0.0033805931016104296, "clip_ratio/region_mean": 0.007617679599206895, "completions/clipped_ratio": 0.0, "completions/max_length": 3096.0, "completions/max_terminated_length": 3096.0, "completions/mean_length": 2864.375, "completions/mean_terminated_length": 2864.375, "completions/min_length": 1607.0, "completions/min_terminated_length": 1607.0, "entropy": 0.030035297153517604, "epoch": 4.284000171360007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00789997074753046, "kl": 0.014004623517394066, "learning_rate": 7.462979994082277e-06, "loss": 0.0002, "num_tokens": 35766949.0, "reward": 7.793142318725586, "reward_std": 32.59622573852539, "rewards/rollout_reward_func/mean": 7.793142318725586, "rewards/rollout_reward_func/std": 32.59622573852539, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.875, "sampling/sampling_logp_difference/mean": 0.2509509027004242, "step": 1071, "step_time": 39.90147485799389 }, { "clip_ratio/high_max": 0.004467167804250494, "clip_ratio/high_mean": 0.004467167804250494, "clip_ratio/low_mean": 0.003330154679133557, "clip_ratio/low_min": 0.003330154679133557, "clip_ratio/region_mean": 0.007797322527039796, "completions/clipped_ratio": 0.0, "completions/max_length": 2976.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 2812.25, "completions/mean_terminated_length": 2812.25, "completions/min_length": 2507.0, "completions/min_terminated_length": 2507.0, "entropy": 0.02910860045813024, "epoch": 4.288000171520007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004962165839970112, "kl": 0.013478630688041449, "learning_rate": 7.462979994070836e-06, "loss": 0.0002, "num_tokens": 35824801.0, "reward": -0.19727861881256104, "reward_std": 20.851806640625, "rewards/rollout_reward_func/mean": -0.19727861881256104, "rewards/rollout_reward_func/std": 20.851808547973633, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.75, "sampling/sampling_logp_difference/mean": 0.2523071765899658, "step": 1072, "step_time": 39.63769537000917 }, { "clip_ratio/high_max": 0.004980223660822958, "clip_ratio/high_mean": 0.004980223660822958, "clip_ratio/low_mean": 0.0035677222767844796, "clip_ratio/low_min": 0.0035677222767844796, "clip_ratio/region_mean": 0.00854794605402276, "completions/clipped_ratio": 0.0, "completions/max_length": 3151.0, "completions/max_terminated_length": 3151.0, "completions/mean_length": 3050.125, "completions/mean_terminated_length": 3050.125, "completions/min_length": 2825.0, "completions/min_terminated_length": 2825.0, "entropy": 0.027805997524410486, "epoch": 4.292000171680007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0029627473559230566, "kl": 0.012669483781792223, "learning_rate": 7.462979994059384e-06, "loss": 0.0002, "num_tokens": 35886560.0, "reward": 3.21252179145813, "reward_std": 17.792875289916992, "rewards/rollout_reward_func/mean": 3.21252179145813, "rewards/rollout_reward_func/std": 17.792877197265625, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.515625, "sampling/sampling_logp_difference/mean": 0.24053040146827698, "step": 1073, "step_time": 40.15549427000224 }, { "clip_ratio/high_max": 0.003452246921369806, "clip_ratio/high_mean": 0.003452246921369806, "clip_ratio/low_mean": 0.005033728230046108, "clip_ratio/low_min": 0.005033728230046108, "clip_ratio/region_mean": 0.008485975035000592, "completions/clipped_ratio": 0.0, "completions/max_length": 3049.0, "completions/max_terminated_length": 3049.0, "completions/mean_length": 2803.125, "completions/mean_terminated_length": 2803.125, "completions/min_length": 2230.0, "completions/min_terminated_length": 2230.0, "entropy": 0.03049112670123577, "epoch": 4.296000171840007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032545188441872597, "kl": 0.012950935924891382, "learning_rate": 7.462979994047921e-06, "loss": 0.0002, "num_tokens": 35944287.0, "reward": 3.611121654510498, "reward_std": 18.667150497436523, "rewards/rollout_reward_func/mean": 3.611121654510498, "rewards/rollout_reward_func/std": 18.667152404785156, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.625, "sampling/sampling_logp_difference/mean": 0.25754281878471375, "step": 1074, "step_time": 39.08053669097717 }, { "clip_ratio/high_max": 0.0025934631121344864, "clip_ratio/high_mean": 0.0025934631121344864, "clip_ratio/low_mean": 0.004857688763877377, "clip_ratio/low_min": 0.004857688763877377, "clip_ratio/region_mean": 0.007451151730492711, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 2649.5, "completions/mean_terminated_length": 2649.5, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.030427440535277128, "epoch": 4.3000001720000066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005931057967245579, "kl": 0.015709395636804402, "learning_rate": 7.462979994036447e-06, "loss": 0.0002, "num_tokens": 35999539.0, "reward": 1.6853736639022827, "reward_std": 26.10462760925293, "rewards/rollout_reward_func/mean": 1.6853736639022827, "rewards/rollout_reward_func/std": 26.10462760925293, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.18762969970703, "sampling/sampling_logp_difference/mean": 0.2555425465106964, "step": 1075, "step_time": 37.83815855998546 }, { "clip_ratio/high_max": 0.0032811734126880765, "clip_ratio/high_mean": 0.0032811734126880765, "clip_ratio/low_mean": 0.005084445612737909, "clip_ratio/low_min": 0.005084445612737909, "clip_ratio/region_mean": 0.008365619054529816, "completions/clipped_ratio": 0.0, "completions/max_length": 2966.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 2810.9375, "completions/mean_terminated_length": 2810.9375, "completions/min_length": 2610.0, "completions/min_terminated_length": 2610.0, "entropy": 0.030471575679257512, "epoch": 4.304000172160007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005699950270354748, "kl": 0.014071598299778998, "learning_rate": 7.462979994024963e-06, "loss": 0.0002, "num_tokens": 36057389.0, "reward": 1.6169462203979492, "reward_std": 33.59344482421875, "rewards/rollout_reward_func/mean": 1.6169462203979492, "rewards/rollout_reward_func/std": 33.593448638916016, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.234375, "sampling/sampling_logp_difference/mean": 0.2621968388557434, "step": 1076, "step_time": 39.263560542996856 }, { "clip_ratio/high_max": 0.003093225066550076, "clip_ratio/high_mean": 0.003093225066550076, "clip_ratio/low_mean": 0.004322806489653885, "clip_ratio/low_min": 0.004322806489653885, "clip_ratio/region_mean": 0.007416031614411622, "completions/clipped_ratio": 0.0, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 2848.25, "completions/mean_terminated_length": 2848.25, "completions/min_length": 2779.0, "completions/min_terminated_length": 2779.0, "entropy": 0.029827693244442344, "epoch": 4.308000172320007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003269489388912916, "kl": 0.012593881343491375, "learning_rate": 7.4629799940134666e-06, "loss": 0.0002, "num_tokens": 36115841.0, "reward": -2.299649715423584, "reward_std": 13.988448143005371, "rewards/rollout_reward_func/mean": -2.299649715423584, "rewards/rollout_reward_func/std": 13.988449096679688, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.8125, "sampling/sampling_logp_difference/mean": 0.24951505661010742, "step": 1077, "step_time": 39.531843536984525 }, { "clip_ratio/high_max": 0.003038847498828545, "clip_ratio/high_mean": 0.003038847498828545, "clip_ratio/low_mean": 0.005060004739789292, "clip_ratio/low_min": 0.005060004739789292, "clip_ratio/region_mean": 0.008098852180410177, "completions/clipped_ratio": 0.0, "completions/max_length": 3138.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 2793.4375, "completions/mean_terminated_length": 2793.4375, "completions/min_length": 1136.0, "completions/min_terminated_length": 1136.0, "entropy": 0.03056667186319828, "epoch": 4.312000172480007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004015825688838959, "kl": 0.011792707839049399, "learning_rate": 7.46297999400196e-06, "loss": 0.0002, "num_tokens": 36173433.0, "reward": 7.096505165100098, "reward_std": 32.589725494384766, "rewards/rollout_reward_func/mean": 7.096505165100098, "rewards/rollout_reward_func/std": 32.589725494384766, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.3125, "sampling/sampling_logp_difference/mean": 0.25353559851646423, "step": 1078, "step_time": 39.310459390995675 }, { "clip_ratio/high_max": 0.005207132373470813, "clip_ratio/high_mean": 0.005207132373470813, "clip_ratio/low_mean": 0.002866177004761994, "clip_ratio/low_min": 0.002866177004761994, "clip_ratio/region_mean": 0.008073309378232807, "completions/clipped_ratio": 0.0, "completions/max_length": 3108.0, "completions/max_terminated_length": 3108.0, "completions/mean_length": 2960.375, "completions/mean_terminated_length": 2960.375, "completions/min_length": 2748.0, "completions/min_terminated_length": 2748.0, "entropy": 0.027562136529013515, "epoch": 4.316000172640007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003544720122590661, "kl": 0.011363665689714253, "learning_rate": 7.462979993990442e-06, "loss": 0.0002, "num_tokens": 36233714.0, "reward": -1.1491247415542603, "reward_std": 9.193671226501465, "rewards/rollout_reward_func/mean": -1.1491247415542603, "rewards/rollout_reward_func/std": 9.193671226501465, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.40625, "sampling/sampling_logp_difference/mean": 0.23939117789268494, "step": 1079, "step_time": 40.36701954298769 }, { "clip_ratio/high_max": 0.00413861675770022, "clip_ratio/high_mean": 0.00413861675770022, "clip_ratio/low_mean": 0.003956347733037546, "clip_ratio/low_min": 0.003956347733037546, "clip_ratio/region_mean": 0.008094964490737766, "completions/clipped_ratio": 0.0, "completions/max_length": 3100.0, "completions/max_terminated_length": 3100.0, "completions/mean_length": 2795.3125, "completions/mean_terminated_length": 2795.3125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.029555623419582844, "epoch": 4.320000172800007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005171983502805233, "kl": 0.013518271851353347, "learning_rate": 7.462979993978913e-06, "loss": 0.0002, "num_tokens": 36291363.0, "reward": 6.017865180969238, "reward_std": 26.750017166137695, "rewards/rollout_reward_func/mean": 6.017865180969238, "rewards/rollout_reward_func/std": 26.750019073486328, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 58.28125, "sampling/sampling_logp_difference/mean": 0.2463841438293457, "step": 1080, "step_time": 39.744123666969244 }, { "clip_ratio/high_max": 0.0038085944834165275, "clip_ratio/high_mean": 0.0038085944834165275, "clip_ratio/low_mean": 0.0037204570253379643, "clip_ratio/low_min": 0.0037204570253379643, "clip_ratio/region_mean": 0.007529051450546831, "completions/clipped_ratio": 0.0, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 2863.8125, "completions/mean_terminated_length": 2863.8125, "completions/min_length": 2766.0, "completions/min_terminated_length": 2766.0, "entropy": 0.029559491435065866, "epoch": 4.324000172960007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003224870888516307, "kl": 0.013256249367259443, "learning_rate": 7.462979993967373e-06, "loss": 0.0002, "num_tokens": 36350066.0, "reward": -3.1538643836975098, "reward_std": 13.878722190856934, "rewards/rollout_reward_func/mean": -3.1538643836975098, "rewards/rollout_reward_func/std": 13.87872314453125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.375, "sampling/sampling_logp_difference/mean": 0.24308070540428162, "step": 1081, "step_time": 39.65001612799824 }, { "clip_ratio/high_max": 0.00548393715871498, "clip_ratio/high_mean": 0.00548393715871498, "clip_ratio/low_mean": 0.0028571636939886957, "clip_ratio/low_min": 0.0028571636939886957, "clip_ratio/region_mean": 0.008341100823599845, "completions/clipped_ratio": 0.0, "completions/max_length": 3091.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 2957.0, "completions/mean_terminated_length": 2957.0, "completions/min_length": 2790.0, "completions/min_terminated_length": 2790.0, "entropy": 0.0277847854886204, "epoch": 4.328000173120007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005468583200126886, "kl": 0.012324000941589475, "learning_rate": 7.462979993955821e-06, "loss": 0.0002, "num_tokens": 36410302.0, "reward": 1.451819658279419, "reward_std": 14.945761680603027, "rewards/rollout_reward_func/mean": 1.451819658279419, "rewards/rollout_reward_func/std": 14.945761680603027, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.5625, "sampling/sampling_logp_difference/mean": 0.23973745107650757, "step": 1082, "step_time": 40.64697977199103 }, { "clip_ratio/high_max": 0.0035348731908015907, "clip_ratio/high_mean": 0.0035348731908015907, "clip_ratio/low_mean": 0.0043223372485954314, "clip_ratio/low_min": 0.0043223372485954314, "clip_ratio/region_mean": 0.007857210468500853, "completions/clipped_ratio": 0.0, "completions/max_length": 3172.0, "completions/max_terminated_length": 3172.0, "completions/mean_length": 2744.5625, "completions/mean_terminated_length": 2744.5625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.031077078310772777, "epoch": 4.3320001732800066e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004708727356046438, "kl": 0.012791173183359206, "learning_rate": 7.462979993944259e-06, "loss": 0.0002, "num_tokens": 36467122.0, "reward": 1.985855221748352, "reward_std": 28.44521141052246, "rewards/rollout_reward_func/mean": 1.985855221748352, "rewards/rollout_reward_func/std": 28.445215225219727, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.15625, "sampling/sampling_logp_difference/mean": 0.2555074989795685, "step": 1083, "step_time": 44.49307025699818 }, { "clip_ratio/high_max": 0.0032617136312182993, "clip_ratio/high_mean": 0.0032617136312182993, "clip_ratio/low_mean": 0.0044748389918822795, "clip_ratio/low_min": 0.0044748389918822795, "clip_ratio/region_mean": 0.00773655268130824, "completions/clipped_ratio": 0.0, "completions/max_length": 2970.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 2875.0625, "completions/mean_terminated_length": 2875.0625, "completions/min_length": 2752.0, "completions/min_terminated_length": 2752.0, "entropy": 0.028620913391932845, "epoch": 4.336000173440007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003461865009739995, "kl": 0.011638331227004528, "learning_rate": 7.462979993932686e-06, "loss": 0.0002, "num_tokens": 36526014.0, "reward": -5.951621055603027, "reward_std": 10.050958633422852, "rewards/rollout_reward_func/mean": -5.951621055603027, "rewards/rollout_reward_func/std": 10.050959587097168, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.0625, "sampling/sampling_logp_difference/mean": 0.24537460505962372, "step": 1084, "step_time": 40.13080184799037 }, { "clip_ratio/high_max": 0.0030337420175783336, "clip_ratio/high_mean": 0.0030337420175783336, "clip_ratio/low_mean": 0.004755859234137461, "clip_ratio/low_min": 0.004755859234137461, "clip_ratio/region_mean": 0.007789601222611964, "completions/clipped_ratio": 0.0, "completions/max_length": 3095.0, "completions/max_terminated_length": 3095.0, "completions/mean_length": 2802.9375, "completions/mean_terminated_length": 2802.9375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "entropy": 0.028680460527539253, "epoch": 4.340000173600007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01140031311661005, "kl": 0.012235287926159799, "learning_rate": 7.462979993921101e-06, "loss": 0.0002, "num_tokens": 36583797.0, "reward": 8.237627983093262, "reward_std": 26.780866622924805, "rewards/rollout_reward_func/mean": 8.237627983093262, "rewards/rollout_reward_func/std": 26.78087043762207, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 40.312503814697266, "sampling/sampling_logp_difference/mean": 0.24602194130420685, "step": 1085, "step_time": 39.6766505560372 }, { "clip_ratio/high_max": 0.0035342931223567575, "clip_ratio/high_mean": 0.0035342931223567575, "clip_ratio/low_mean": 0.0041387413803022355, "clip_ratio/low_min": 0.0041387413803022355, "clip_ratio/region_mean": 0.007673034502658993, "completions/clipped_ratio": 0.0, "completions/max_length": 3102.0, "completions/max_terminated_length": 3102.0, "completions/mean_length": 2938.9375, "completions/mean_terminated_length": 2938.9375, "completions/min_length": 2795.0, "completions/min_terminated_length": 2795.0, "entropy": 0.027771319961175323, "epoch": 4.344000173760007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003503934945911169, "kl": 0.011452554143033922, "learning_rate": 7.462979993909506e-06, "loss": 0.0002, "num_tokens": 36643738.0, "reward": -3.123060703277588, "reward_std": 17.348854064941406, "rewards/rollout_reward_func/mean": -3.123060703277588, "rewards/rollout_reward_func/std": 17.348854064941406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.03125, "sampling/sampling_logp_difference/mean": 0.2444225251674652, "step": 1086, "step_time": 40.65744258399354 }, { "clip_ratio/high_max": 0.004163262201473117, "clip_ratio/high_mean": 0.004163262201473117, "clip_ratio/low_mean": 0.004617463768227026, "clip_ratio/low_min": 0.004617463768227026, "clip_ratio/region_mean": 0.008780725940596312, "completions/clipped_ratio": 0.0, "completions/max_length": 3070.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 2903.5, "completions/mean_terminated_length": 2903.5, "completions/min_length": 2729.0, "completions/min_terminated_length": 2729.0, "entropy": 0.028311162255704403, "epoch": 4.348000173920007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031323344446718693, "kl": 0.012902627466246486, "learning_rate": 7.4629799938979e-06, "loss": 0.0002, "num_tokens": 36703091.0, "reward": -5.490224361419678, "reward_std": 12.678098678588867, "rewards/rollout_reward_func/mean": -5.490224361419678, "rewards/rollout_reward_func/std": 12.678098678588867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.4375, "sampling/sampling_logp_difference/mean": 0.24683959782123566, "step": 1087, "step_time": 39.37422795101884 }, { "clip_ratio/high_max": 0.003330335021018982, "clip_ratio/high_mean": 0.003330335021018982, "clip_ratio/low_mean": 0.004944093903759494, "clip_ratio/low_min": 0.004944093903759494, "clip_ratio/region_mean": 0.008274428837466985, "completions/clipped_ratio": 0.0, "completions/max_length": 3136.0, "completions/max_terminated_length": 3136.0, "completions/mean_length": 2879.9375, "completions/mean_terminated_length": 2879.9375, "completions/min_length": 2727.0, "completions/min_terminated_length": 2727.0, "entropy": 0.02929218835197389, "epoch": 4.352000174080007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028234736528247595, "kl": 0.013427553349174559, "learning_rate": 7.462979993886281e-06, "loss": 0.0002, "num_tokens": 36762050.0, "reward": 0.7258753776550293, "reward_std": 19.605854034423828, "rewards/rollout_reward_func/mean": 0.7258753776550293, "rewards/rollout_reward_func/std": 19.605852127075195, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.3125, "sampling/sampling_logp_difference/mean": 0.2518324553966522, "step": 1088, "step_time": 39.107716809012345 }, { "clip_ratio/high_max": 0.004190720123006031, "clip_ratio/high_mean": 0.004190720123006031, "clip_ratio/low_mean": 0.003653745836345479, "clip_ratio/low_min": 0.003653745836345479, "clip_ratio/region_mean": 0.00784446601755917, "completions/clipped_ratio": 0.0, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 2933.375, "completions/mean_terminated_length": 2933.375, "completions/min_length": 2827.0, "completions/min_terminated_length": 2827.0, "entropy": 0.028225712943822145, "epoch": 4.356000174240007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038676185067743063, "kl": 0.011870147427543998, "learning_rate": 7.462979993874654e-06, "loss": 0.0002, "num_tokens": 36821869.0, "reward": -2.420806884765625, "reward_std": 10.422677993774414, "rewards/rollout_reward_func/mean": -2.420806884765625, "rewards/rollout_reward_func/std": 10.422677993774414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.800785064697266, "sampling/sampling_logp_difference/mean": 0.2466847449541092, "step": 1089, "step_time": 39.40140445699217 }, { "clip_ratio/high_max": 0.003513655683491379, "clip_ratio/high_mean": 0.003513655683491379, "clip_ratio/low_mean": 0.004057664686115459, "clip_ratio/low_min": 0.004057664686115459, "clip_ratio/region_mean": 0.007571320398710668, "completions/clipped_ratio": 0.0, "completions/max_length": 3098.0, "completions/max_terminated_length": 3098.0, "completions/mean_length": 2907.5625, "completions/mean_terminated_length": 2907.5625, "completions/min_length": 2755.0, "completions/min_terminated_length": 2755.0, "entropy": 0.029107811162248254, "epoch": 4.360000174400007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007408893667161465, "kl": 0.010941896587610245, "learning_rate": 7.462979993863013e-06, "loss": 0.0002, "num_tokens": 36881291.0, "reward": -2.669165849685669, "reward_std": 10.181453704833984, "rewards/rollout_reward_func/mean": -2.669165849685669, "rewards/rollout_reward_func/std": 10.1814546585083, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.5, "sampling/sampling_logp_difference/mean": 0.2503148019313812, "step": 1090, "step_time": 40.557560825996916 }, { "clip_ratio/high_max": 0.0033309202699456364, "clip_ratio/high_mean": 0.0033309202699456364, "clip_ratio/low_mean": 0.004054035933222622, "clip_ratio/low_min": 0.004054035933222622, "clip_ratio/region_mean": 0.007384956174064428, "completions/clipped_ratio": 0.0, "completions/max_length": 3095.0, "completions/max_terminated_length": 3095.0, "completions/mean_length": 2807.9375, "completions/mean_terminated_length": 2807.9375, "completions/min_length": 1792.0, "completions/min_terminated_length": 1792.0, "entropy": 0.02962216013111174, "epoch": 4.364000174560007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035325977951288223, "kl": 0.011203383677639067, "learning_rate": 7.462979993851363e-06, "loss": 0.0002, "num_tokens": 36939141.0, "reward": 6.679830551147461, "reward_std": 34.90888214111328, "rewards/rollout_reward_func/mean": 6.679830551147461, "rewards/rollout_reward_func/std": 34.908878326416016, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.078125, "sampling/sampling_logp_difference/mean": 0.2460545301437378, "step": 1091, "step_time": 39.32598441798473 }, { "clip_ratio/high_max": 0.004118526965612546, "clip_ratio/high_mean": 0.004118526965612546, "clip_ratio/low_mean": 0.004043967055622488, "clip_ratio/low_min": 0.004043967055622488, "clip_ratio/region_mean": 0.008162494050338864, "completions/clipped_ratio": 0.0, "completions/max_length": 3073.0, "completions/max_terminated_length": 3073.0, "completions/mean_length": 2911.9375, "completions/mean_terminated_length": 2911.9375, "completions/min_length": 2776.0, "completions/min_terminated_length": 2776.0, "entropy": 0.029251845087856054, "epoch": 4.368000174720007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.019636889919638634, "kl": 0.012677778315264732, "learning_rate": 7.462979993839702e-06, "loss": 0.0002, "num_tokens": 36998637.0, "reward": 2.977764129638672, "reward_std": 12.483434677124023, "rewards/rollout_reward_func/mean": 2.977764129638672, "rewards/rollout_reward_func/std": 12.483434677124023, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.40625, "sampling/sampling_logp_difference/mean": 0.24448145925998688, "step": 1092, "step_time": 39.92086875700625 }, { "clip_ratio/high_max": 0.003439009131398052, "clip_ratio/high_mean": 0.003439009131398052, "clip_ratio/low_mean": 0.005358494003303349, "clip_ratio/low_min": 0.005358494003303349, "clip_ratio/region_mean": 0.008797503192909062, "completions/clipped_ratio": 0.0, "completions/max_length": 2927.0, "completions/max_terminated_length": 2927.0, "completions/mean_length": 2706.8125, "completions/mean_terminated_length": 2706.8125, "completions/min_length": 1026.0, "completions/min_terminated_length": 1026.0, "entropy": 0.03292910964228213, "epoch": 4.372000174880007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00288976589217782, "kl": 0.012393912300467491, "learning_rate": 7.462979993828029e-06, "loss": 0.0002, "num_tokens": 37054822.0, "reward": 11.054157257080078, "reward_std": 29.09334373474121, "rewards/rollout_reward_func/mean": 11.054157257080078, "rewards/rollout_reward_func/std": 29.093341827392578, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.4375, "sampling/sampling_logp_difference/mean": 0.2573806345462799, "step": 1093, "step_time": 38.324903255997924 }, { "clip_ratio/high_max": 0.004320480831665918, "clip_ratio/high_mean": 0.004320480831665918, "clip_ratio/low_mean": 0.003887867438606918, "clip_ratio/low_min": 0.003887867438606918, "clip_ratio/region_mean": 0.008208348299376667, "completions/clipped_ratio": 0.0, "completions/max_length": 3054.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 2883.6875, "completions/mean_terminated_length": 2883.6875, "completions/min_length": 2518.0, "completions/min_terminated_length": 2518.0, "entropy": 0.02930310578085482, "epoch": 4.376000175040007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004422912839800119, "kl": 0.011255929828621447, "learning_rate": 7.462979993816345e-06, "loss": 0.0002, "num_tokens": 37113860.0, "reward": 6.2135009765625, "reward_std": 24.805191040039062, "rewards/rollout_reward_func/mean": 6.2135009765625, "rewards/rollout_reward_func/std": 24.805191040039062, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.609375, "sampling/sampling_logp_difference/mean": 0.25757908821105957, "step": 1094, "step_time": 39.64874695900653 }, { "clip_ratio/high_max": 0.004303380672354251, "clip_ratio/high_mean": 0.004303380672354251, "clip_ratio/low_mean": 0.003651604667538777, "clip_ratio/low_min": 0.003651604667538777, "clip_ratio/region_mean": 0.00795498542720452, "completions/clipped_ratio": 0.0, "completions/max_length": 3107.0, "completions/max_terminated_length": 3107.0, "completions/mean_length": 2896.5625, "completions/mean_terminated_length": 2896.5625, "completions/min_length": 2644.0, "completions/min_terminated_length": 2644.0, "entropy": 0.0293764544185251, "epoch": 4.380000175200007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036835046485066414, "kl": 0.011554702185094357, "learning_rate": 7.462979993804649e-06, "loss": 0.0002, "num_tokens": 37173110.0, "reward": 0.4624483585357666, "reward_std": 20.86042594909668, "rewards/rollout_reward_func/mean": 0.4624483585357666, "rewards/rollout_reward_func/std": 20.86042594909668, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 63.125, "sampling/sampling_logp_difference/mean": 0.2481311708688736, "step": 1095, "step_time": 40.42558007898333 }, { "clip_ratio/high_max": 0.0035031992301810533, "clip_ratio/high_mean": 0.0035031992301810533, "clip_ratio/low_mean": 0.004085527820279822, "clip_ratio/low_min": 0.004085527820279822, "clip_ratio/region_mean": 0.007588726992253214, "completions/clipped_ratio": 0.0, "completions/max_length": 3034.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 2905.8125, "completions/mean_terminated_length": 2905.8125, "completions/min_length": 2752.0, "completions/min_terminated_length": 2752.0, "entropy": 0.028938046656548977, "epoch": 4.384000175360007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00814035627990961, "kl": 0.013037582510150969, "learning_rate": 7.462979993792944e-06, "loss": 0.0002, "num_tokens": 37232499.0, "reward": -5.093977451324463, "reward_std": 10.76500129699707, "rewards/rollout_reward_func/mean": -5.093977451324463, "rewards/rollout_reward_func/std": 10.76500129699707, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.3125, "sampling/sampling_logp_difference/mean": 0.24011093378067017, "step": 1096, "step_time": 39.602890034992015 }, { "clip_ratio/high_max": 0.005882118770387024, "clip_ratio/high_mean": 0.005882118770387024, "clip_ratio/low_mean": 0.0024752867175266147, "clip_ratio/low_min": 0.0024752867175266147, "clip_ratio/region_mean": 0.008357405429705977, "completions/clipped_ratio": 0.0, "completions/max_length": 3082.0, "completions/max_terminated_length": 3082.0, "completions/mean_length": 2941.8125, "completions/mean_terminated_length": 2941.8125, "completions/min_length": 2796.0, "completions/min_terminated_length": 2796.0, "entropy": 0.02828919980674982, "epoch": 4.388000175520007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004467447753995657, "kl": 0.012200544006191194, "learning_rate": 7.462979993781227e-06, "loss": 0.0002, "num_tokens": 37292484.0, "reward": 1.393117904663086, "reward_std": 12.597116470336914, "rewards/rollout_reward_func/mean": 1.393117904663086, "rewards/rollout_reward_func/std": 12.59711742401123, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.578125, "sampling/sampling_logp_difference/mean": 0.2475636601448059, "step": 1097, "step_time": 40.647823697014246 }, { "clip_ratio/high_max": 0.0026607322797644883, "clip_ratio/high_mean": 0.0026607322797644883, "clip_ratio/low_mean": 0.005175609112484381, "clip_ratio/low_min": 0.005175609112484381, "clip_ratio/region_mean": 0.007836341392248869, "completions/clipped_ratio": 0.0, "completions/max_length": 3118.0, "completions/max_terminated_length": 3118.0, "completions/mean_length": 2893.0625, "completions/mean_terminated_length": 2893.0625, "completions/min_length": 2629.0, "completions/min_terminated_length": 2629.0, "entropy": 0.030480330577120185, "epoch": 4.392000175680007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006829818245023489, "kl": 0.012980204890482128, "learning_rate": 7.462979993769498e-06, "loss": 0.0002, "num_tokens": 37351666.0, "reward": 4.347774028778076, "reward_std": 33.932865142822266, "rewards/rollout_reward_func/mean": 4.347774028778076, "rewards/rollout_reward_func/std": 33.932865142822266, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.375, "sampling/sampling_logp_difference/mean": 0.26105010509490967, "step": 1098, "step_time": 40.78453063299821 }, { "clip_ratio/high_max": 0.002900014806073159, "clip_ratio/high_mean": 0.002900014806073159, "clip_ratio/low_mean": 0.00482179771643132, "clip_ratio/low_min": 0.00482179771643132, "clip_ratio/region_mean": 0.007721812464296818, "completions/clipped_ratio": 0.0, "completions/max_length": 3128.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 2917.3125, "completions/mean_terminated_length": 2917.3125, "completions/min_length": 2708.0, "completions/min_terminated_length": 2708.0, "entropy": 0.028844565153121948, "epoch": 4.396000175840007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003469482995569706, "kl": 0.01146037655416876, "learning_rate": 7.46297999375776e-06, "loss": 0.0002, "num_tokens": 37411233.0, "reward": -0.44899630546569824, "reward_std": 14.086431503295898, "rewards/rollout_reward_func/mean": -0.44899630546569824, "rewards/rollout_reward_func/std": 14.086431503295898, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.109375, "sampling/sampling_logp_difference/mean": 0.24760007858276367, "step": 1099, "step_time": 40.62156513800437 }, { "clip_ratio/high_max": 0.0034983555087819695, "clip_ratio/high_mean": 0.0034983555087819695, "clip_ratio/low_mean": 0.004699563258327544, "clip_ratio/low_min": 0.004699563258327544, "clip_ratio/region_mean": 0.008197918650694191, "completions/clipped_ratio": 0.0, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 2824.75, "completions/mean_terminated_length": 2824.75, "completions/min_length": 2224.0, "completions/min_terminated_length": 2224.0, "entropy": 0.03235102281905711, "epoch": 4.400000176000007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006465403828769922, "kl": 0.012669760501012206, "learning_rate": 7.46297999374601e-06, "loss": 0.0002, "num_tokens": 37469322.0, "reward": 2.748781204223633, "reward_std": 23.909435272216797, "rewards/rollout_reward_func/mean": 2.748781204223633, "rewards/rollout_reward_func/std": 23.909435272216797, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.375003814697266, "sampling/sampling_logp_difference/mean": 0.25004154443740845, "step": 1100, "step_time": 39.26569430298696 }, { "clip_ratio/high_max": 0.0046065250935498625, "clip_ratio/high_mean": 0.0046065250935498625, "clip_ratio/low_mean": 0.003812256793025881, "clip_ratio/low_min": 0.003812256793025881, "clip_ratio/region_mean": 0.008418781973887235, "completions/clipped_ratio": 0.0, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 2941.5, "completions/mean_terminated_length": 2941.5, "completions/min_length": 2856.0, "completions/min_terminated_length": 2856.0, "entropy": 0.02857138798572123, "epoch": 4.4040001761600074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.015406576916575432, "kl": 0.015561993233859539, "learning_rate": 7.462979993734249e-06, "loss": 0.0002, "num_tokens": 37529304.0, "reward": -2.8924083709716797, "reward_std": 13.600421905517578, "rewards/rollout_reward_func/mean": -2.8924083709716797, "rewards/rollout_reward_func/std": 13.600422859191895, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.6484375, "sampling/sampling_logp_difference/mean": 0.2535540461540222, "step": 1101, "step_time": 40.44291890699242 }, { "clip_ratio/high_max": 0.00328907766379416, "clip_ratio/high_mean": 0.00328907766379416, "clip_ratio/low_mean": 0.0042211007967125624, "clip_ratio/low_min": 0.0042211007967125624, "clip_ratio/region_mean": 0.007510178489610553, "completions/clipped_ratio": 0.0, "completions/max_length": 3114.0, "completions/max_terminated_length": 3114.0, "completions/mean_length": 2934.75, "completions/mean_terminated_length": 2934.75, "completions/min_length": 2767.0, "completions/min_terminated_length": 2767.0, "entropy": 0.029030583100393414, "epoch": 4.408000176320007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0066563962027430534, "kl": 0.010464978695381433, "learning_rate": 7.462979993722477e-06, "loss": 0.0001, "num_tokens": 37589185.0, "reward": 1.3392187356948853, "reward_std": 15.058025360107422, "rewards/rollout_reward_func/mean": 1.3392187356948853, "rewards/rollout_reward_func/std": 15.058027267456055, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.391326904296875, "sampling/sampling_logp_difference/mean": 0.2436663806438446, "step": 1102, "step_time": 40.791131265010335 }, { "clip_ratio/high_max": 0.004693172610132024, "clip_ratio/high_mean": 0.004693172610132024, "clip_ratio/low_mean": 0.0031790912616997957, "clip_ratio/low_min": 0.0031790912616997957, "clip_ratio/region_mean": 0.007872263784520328, "completions/clipped_ratio": 0.0, "completions/max_length": 3143.0, "completions/max_terminated_length": 3143.0, "completions/mean_length": 3023.4375, "completions/mean_terminated_length": 3023.4375, "completions/min_length": 2915.0, "completions/min_terminated_length": 2915.0, "entropy": 0.02773354691453278, "epoch": 4.412000176480007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007027472835034132, "kl": 0.012156201875768602, "learning_rate": 7.462979993710693e-06, "loss": 0.0002, "num_tokens": 37650515.0, "reward": -3.380072593688965, "reward_std": 14.791006088256836, "rewards/rollout_reward_func/mean": -3.380072593688965, "rewards/rollout_reward_func/std": 14.791006088256836, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.4453125, "sampling/sampling_logp_difference/mean": 0.23886288702487946, "step": 1103, "step_time": 40.25755926598504 }, { "clip_ratio/high_max": 0.00415258583961986, "clip_ratio/high_mean": 0.00415258583961986, "clip_ratio/low_mean": 0.0036826881987508386, "clip_ratio/low_min": 0.0036826881987508386, "clip_ratio/region_mean": 0.007835274038370699, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 2866.375, "completions/mean_terminated_length": 2866.375, "completions/min_length": 2754.0, "completions/min_terminated_length": 2754.0, "entropy": 0.02934964490123093, "epoch": 4.416000176640007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006357796955853701, "kl": 0.011504888825584203, "learning_rate": 7.462979993698899e-06, "loss": 0.0002, "num_tokens": 37709258.0, "reward": -5.785195827484131, "reward_std": 11.204742431640625, "rewards/rollout_reward_func/mean": -5.785195827484131, "rewards/rollout_reward_func/std": 11.204743385314941, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.8125114440918, "sampling/sampling_logp_difference/mean": 0.24745525419712067, "step": 1104, "step_time": 39.500821576992166 }, { "clip_ratio/high_max": 0.003437012346694246, "clip_ratio/high_mean": 0.003437012346694246, "clip_ratio/low_mean": 0.004039636085508391, "clip_ratio/low_min": 0.004039636085508391, "clip_ratio/region_mean": 0.007476648432202637, "completions/clipped_ratio": 0.0, "completions/max_length": 3116.0, "completions/max_terminated_length": 3116.0, "completions/mean_length": 2921.0, "completions/mean_terminated_length": 2921.0, "completions/min_length": 2205.0, "completions/min_terminated_length": 2205.0, "entropy": 0.02876848983578384, "epoch": 4.420000176800007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009179907850921154, "kl": 0.01202740368898958, "learning_rate": 7.462979993687093e-06, "loss": 0.0002, "num_tokens": 37768928.0, "reward": 2.571559190750122, "reward_std": 21.797767639160156, "rewards/rollout_reward_func/mean": 2.571559190750122, "rewards/rollout_reward_func/std": 21.79776954650879, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.421875, "sampling/sampling_logp_difference/mean": 0.2432270348072052, "step": 1105, "step_time": 40.295812332988135 }, { "clip_ratio/high_max": 0.0032923344697337598, "clip_ratio/high_mean": 0.0032923344697337598, "clip_ratio/low_mean": 0.00474744857638143, "clip_ratio/low_min": 0.00474744857638143, "clip_ratio/region_mean": 0.00803978304611519, "completions/clipped_ratio": 0.0, "completions/max_length": 3136.0, "completions/max_terminated_length": 3136.0, "completions/mean_length": 2940.0625, "completions/mean_terminated_length": 2940.0625, "completions/min_length": 2685.0, "completions/min_terminated_length": 2685.0, "entropy": 0.028772678459063172, "epoch": 4.424000176960007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002748209750279784, "kl": 0.011439422611147165, "learning_rate": 7.462979993675277e-06, "loss": 0.0002, "num_tokens": 37828885.0, "reward": 1.2588368654251099, "reward_std": 12.783369064331055, "rewards/rollout_reward_func/mean": 1.2588368654251099, "rewards/rollout_reward_func/std": 12.783368110656738, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 62.923828125, "sampling/sampling_logp_difference/mean": 0.2441830039024353, "step": 1106, "step_time": 39.97024996901746 }, { "clip_ratio/high_max": 0.0027994906122330576, "clip_ratio/high_mean": 0.0027994906122330576, "clip_ratio/low_mean": 0.004272397345630452, "clip_ratio/low_min": 0.004272397345630452, "clip_ratio/region_mean": 0.00707188795786351, "completions/clipped_ratio": 0.0, "completions/max_length": 3102.0, "completions/max_terminated_length": 3102.0, "completions/mean_length": 2607.75, "completions/mean_terminated_length": 2607.75, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.03043179912492633, "epoch": 4.428000177120007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038898936472833157, "kl": 0.010081631713546813, "learning_rate": 7.46297999366345e-06, "loss": 0.0001, "num_tokens": 37883511.0, "reward": 6.122200012207031, "reward_std": 31.402706146240234, "rewards/rollout_reward_func/mean": 6.122200012207031, "rewards/rollout_reward_func/std": 31.402708053588867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.5625, "sampling/sampling_logp_difference/mean": 0.24788883328437805, "step": 1107, "step_time": 38.981020710009034 }, { "clip_ratio/high_max": 0.0037570920248981565, "clip_ratio/high_mean": 0.0037570920248981565, "clip_ratio/low_mean": 0.004551890131551772, "clip_ratio/low_min": 0.004551890131551772, "clip_ratio/region_mean": 0.008308982069138438, "completions/clipped_ratio": 0.0, "completions/max_length": 3062.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 2849.4375, "completions/mean_terminated_length": 2849.4375, "completions/min_length": 1629.0, "completions/min_terminated_length": 1629.0, "entropy": 0.031195292249321938, "epoch": 4.432000177280007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004811461083590984, "kl": 0.011151484854053706, "learning_rate": 7.462979993651611e-06, "loss": 0.0002, "num_tokens": 37942018.0, "reward": 4.9340009689331055, "reward_std": 37.57636260986328, "rewards/rollout_reward_func/mean": 4.9340009689331055, "rewards/rollout_reward_func/std": 37.57636642456055, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.8125, "sampling/sampling_logp_difference/mean": 0.24593082070350647, "step": 1108, "step_time": 39.47985723496822 }, { "clip_ratio/high_max": 0.0034652262402232736, "clip_ratio/high_mean": 0.0034652262402232736, "clip_ratio/low_mean": 0.004929480754071847, "clip_ratio/low_min": 0.004929480754071847, "clip_ratio/region_mean": 0.008394706877879798, "completions/clipped_ratio": 0.0, "completions/max_length": 3048.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 2911.5, "completions/mean_terminated_length": 2911.5, "completions/min_length": 2783.0, "completions/min_terminated_length": 2783.0, "entropy": 0.029599650064483285, "epoch": 4.4360001774400074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007676440291106701, "kl": 0.012464813305996358, "learning_rate": 7.462979993639761e-06, "loss": 0.0002, "num_tokens": 38001486.0, "reward": -3.566612720489502, "reward_std": 13.195479393005371, "rewards/rollout_reward_func/mean": -3.566612720489502, "rewards/rollout_reward_func/std": 13.195479393005371, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 43.25, "sampling/sampling_logp_difference/mean": 0.24514290690422058, "step": 1109, "step_time": 39.555507067008875 }, { "clip_ratio/high_max": 0.0034081918420270085, "clip_ratio/high_mean": 0.0034081918420270085, "clip_ratio/low_mean": 0.0040036946011241525, "clip_ratio/low_min": 0.0040036946011241525, "clip_ratio/region_mean": 0.0074118864722549915, "completions/clipped_ratio": 0.0, "completions/max_length": 3112.0, "completions/max_terminated_length": 3112.0, "completions/mean_length": 2866.8125, "completions/mean_terminated_length": 2866.8125, "completions/min_length": 2754.0, "completions/min_terminated_length": 2754.0, "entropy": 0.030140854185447097, "epoch": 4.440000177600007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004796236287802458, "kl": 0.012339538603555411, "learning_rate": 7.4629799936279006e-06, "loss": 0.0002, "num_tokens": 38060238.0, "reward": -4.288240432739258, "reward_std": 12.726479530334473, "rewards/rollout_reward_func/mean": -4.288240432739258, "rewards/rollout_reward_func/std": 12.726479530334473, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.59375, "sampling/sampling_logp_difference/mean": 0.24923771619796753, "step": 1110, "step_time": 40.74631712400878 }, { "clip_ratio/high_max": 0.003911213454557583, "clip_ratio/high_mean": 0.003911213454557583, "clip_ratio/low_mean": 0.0035257424460723996, "clip_ratio/low_min": 0.0035257424460723996, "clip_ratio/region_mean": 0.007436956046149135, "completions/clipped_ratio": 0.0, "completions/max_length": 3119.0, "completions/max_terminated_length": 3119.0, "completions/mean_length": 2978.5, "completions/mean_terminated_length": 2978.5, "completions/min_length": 2796.0, "completions/min_terminated_length": 2796.0, "entropy": 0.02790221362374723, "epoch": 4.444000177760007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004058662336319685, "kl": 0.011332998517900705, "learning_rate": 7.462979993616029e-06, "loss": 0.0002, "num_tokens": 38120822.0, "reward": -3.4651145935058594, "reward_std": 16.240079879760742, "rewards/rollout_reward_func/mean": -3.4651145935058594, "rewards/rollout_reward_func/std": 16.240079879760742, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.671875, "sampling/sampling_logp_difference/mean": 0.2407272458076477, "step": 1111, "step_time": 40.67204329301603 }, { "clip_ratio/high_max": 0.0024771432508714497, "clip_ratio/high_mean": 0.0024771432508714497, "clip_ratio/low_mean": 0.005076844070572406, "clip_ratio/low_min": 0.005076844070572406, "clip_ratio/region_mean": 0.007553987321443856, "completions/clipped_ratio": 0.0, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 2914.75, "completions/mean_terminated_length": 2914.75, "completions/min_length": 2694.0, "completions/min_terminated_length": 2694.0, "entropy": 0.030057216994464397, "epoch": 4.448000177920007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0045284489169716835, "kl": 0.010591791127808392, "learning_rate": 7.462979993604146e-06, "loss": 0.0002, "num_tokens": 38180372.0, "reward": -2.262521982192993, "reward_std": 12.019917488098145, "rewards/rollout_reward_func/mean": -2.262521982192993, "rewards/rollout_reward_func/std": 12.019917488098145, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.985313415527344, "sampling/sampling_logp_difference/mean": 0.24736960232257843, "step": 1112, "step_time": 39.39432826801203 }, { "clip_ratio/high_max": 0.003118035805528052, "clip_ratio/high_mean": 0.003118035805528052, "clip_ratio/low_mean": 0.005223031970672309, "clip_ratio/low_min": 0.005223031970672309, "clip_ratio/region_mean": 0.008341067819856107, "completions/clipped_ratio": 0.0, "completions/max_length": 3091.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 2756.3125, "completions/mean_terminated_length": 2756.3125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.02958964742720127, "epoch": 4.452000178080007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003309434512630105, "kl": 0.01085611735470593, "learning_rate": 7.462979993592253e-06, "loss": 0.0001, "num_tokens": 38237413.0, "reward": 7.875925064086914, "reward_std": 30.419748306274414, "rewards/rollout_reward_func/mean": 7.875925064086914, "rewards/rollout_reward_func/std": 30.419748306274414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.22265625, "sampling/sampling_logp_difference/mean": 0.2553911507129669, "step": 1113, "step_time": 39.65218619799998 }, { "clip_ratio/high_max": 0.002579641994088888, "clip_ratio/high_mean": 0.002579641994088888, "clip_ratio/low_mean": 0.004319337545894086, "clip_ratio/low_min": 0.004319337545894086, "clip_ratio/region_mean": 0.0068989795981906354, "completions/clipped_ratio": 0.0, "completions/max_length": 3124.0, "completions/max_terminated_length": 3124.0, "completions/mean_length": 2799.5625, "completions/mean_terminated_length": 2799.5625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.029085973743349314, "epoch": 4.4560001782400074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004720176570117474, "kl": 0.010525563557166606, "learning_rate": 7.462979993580348e-06, "loss": 0.0001, "num_tokens": 38295131.0, "reward": 2.643146514892578, "reward_std": 27.07338523864746, "rewards/rollout_reward_func/mean": 2.643146514892578, "rewards/rollout_reward_func/std": 27.07338523864746, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.125, "sampling/sampling_logp_difference/mean": 0.23912928998470306, "step": 1114, "step_time": 39.86074367001129 }, { "clip_ratio/high_max": 0.004876540508121252, "clip_ratio/high_mean": 0.004876540508121252, "clip_ratio/low_mean": 0.003740018350072205, "clip_ratio/low_min": 0.003740018350072205, "clip_ratio/region_mean": 0.008616558916401118, "completions/clipped_ratio": 0.0, "completions/max_length": 3128.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 3031.3125, "completions/mean_terminated_length": 3031.3125, "completions/min_length": 2837.0, "completions/min_terminated_length": 2837.0, "entropy": 0.02792055602185428, "epoch": 4.460000178400007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.09420734643936157, "kl": 0.025389847462065518, "learning_rate": 7.462979993568432e-06, "loss": 0.0004, "num_tokens": 38356592.0, "reward": -2.6627392768859863, "reward_std": 12.342949867248535, "rewards/rollout_reward_func/mean": -2.6627392768859863, "rewards/rollout_reward_func/std": 12.342949867248535, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.494140625, "sampling/sampling_logp_difference/mean": 0.2417525053024292, "step": 1115, "step_time": 40.864196582027944 }, { "clip_ratio/high_max": 0.003606547244999092, "clip_ratio/high_mean": 0.003606547244999092, "clip_ratio/low_mean": 0.004350950010120869, "clip_ratio/low_min": 0.004350950010120869, "clip_ratio/region_mean": 0.007957497145980597, "completions/clipped_ratio": 0.0, "completions/max_length": 3108.0, "completions/max_terminated_length": 3108.0, "completions/mean_length": 2611.0, "completions/mean_terminated_length": 2611.0, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 0.02938423748128116, "epoch": 4.464000178560007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0025321433786302805, "kl": 0.01080144796287641, "learning_rate": 7.462979993556506e-06, "loss": 0.0001, "num_tokens": 38411267.0, "reward": 10.839883804321289, "reward_std": 33.976932525634766, "rewards/rollout_reward_func/mean": 10.839883804321289, "rewards/rollout_reward_func/std": 33.97693634033203, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.718910217285156, "sampling/sampling_logp_difference/mean": 0.24207399785518646, "step": 1116, "step_time": 38.96819839099771 }, { "clip_ratio/high_max": 0.004048216535011306, "clip_ratio/high_mean": 0.004048216535011306, "clip_ratio/low_mean": 0.0031830061634536833, "clip_ratio/low_min": 0.0031830061634536833, "clip_ratio/region_mean": 0.00723122269846499, "completions/clipped_ratio": 0.0, "completions/max_length": 3098.0, "completions/max_terminated_length": 3098.0, "completions/mean_length": 2941.75, "completions/mean_terminated_length": 2941.75, "completions/min_length": 2779.0, "completions/min_terminated_length": 2779.0, "entropy": 0.02910278062336147, "epoch": 4.468000178720007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.06420553475618362, "kl": 0.010998606565408409, "learning_rate": 7.462979993544568e-06, "loss": 0.0002, "num_tokens": 38471251.0, "reward": -3.498727321624756, "reward_std": 12.034724235534668, "rewards/rollout_reward_func/mean": -3.498727321624756, "rewards/rollout_reward_func/std": 12.034725189208984, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.9375, "sampling/sampling_logp_difference/mean": 0.24821676313877106, "step": 1117, "step_time": 40.552096221013926 }, { "clip_ratio/high_max": 0.002650359907420352, "clip_ratio/high_mean": 0.002650359907420352, "clip_ratio/low_mean": 0.005552972463192418, "clip_ratio/low_min": 0.005552972463192418, "clip_ratio/region_mean": 0.008203332545235753, "completions/clipped_ratio": 0.0, "completions/max_length": 3152.0, "completions/max_terminated_length": 3152.0, "completions/mean_length": 2878.9375, "completions/mean_terminated_length": 2878.9375, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "entropy": 0.028519095620140433, "epoch": 4.472000178880007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009919241070747375, "kl": 0.013030442758463323, "learning_rate": 7.462979993532619e-06, "loss": 0.0002, "num_tokens": 38530254.0, "reward": 2.748319625854492, "reward_std": 23.989660263061523, "rewards/rollout_reward_func/mean": 2.748319625854492, "rewards/rollout_reward_func/std": 23.989660263061523, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.0, "sampling/sampling_logp_difference/mean": 0.2406451553106308, "step": 1118, "step_time": 39.61376456700964 }, { "clip_ratio/high_max": 0.004536719527095556, "clip_ratio/high_mean": 0.004536719527095556, "clip_ratio/low_mean": 0.0036144684127066284, "clip_ratio/low_min": 0.0036144684127066284, "clip_ratio/region_mean": 0.008151187968906015, "completions/clipped_ratio": 0.0, "completions/max_length": 3117.0, "completions/max_terminated_length": 3117.0, "completions/mean_length": 2955.1875, "completions/mean_terminated_length": 2955.1875, "completions/min_length": 2272.0, "completions/min_terminated_length": 2272.0, "entropy": 0.02978577883914113, "epoch": 4.4760001790400075e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004525630734860897, "kl": 0.011053259833715856, "learning_rate": 7.462979993520659e-06, "loss": 0.0002, "num_tokens": 38590477.0, "reward": 4.230072975158691, "reward_std": 38.1122932434082, "rewards/rollout_reward_func/mean": 4.230072975158691, "rewards/rollout_reward_func/std": 38.11229705810547, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.65625, "sampling/sampling_logp_difference/mean": 0.24907691776752472, "step": 1119, "step_time": 40.433959207017324 }, { "clip_ratio/high_max": 0.004147130064666271, "clip_ratio/high_mean": 0.004147130064666271, "clip_ratio/low_mean": 0.004136820673011243, "clip_ratio/low_min": 0.004136820673011243, "clip_ratio/region_mean": 0.008283950795885175, "completions/clipped_ratio": 0.0, "completions/max_length": 2973.0, "completions/max_terminated_length": 2973.0, "completions/mean_length": 2845.3125, "completions/mean_terminated_length": 2845.3125, "completions/min_length": 2609.0, "completions/min_terminated_length": 2609.0, "entropy": 0.030206482158973813, "epoch": 4.480000179200007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009565039537847042, "kl": 0.013404507888481021, "learning_rate": 7.4629799935086874e-06, "loss": 0.0002, "num_tokens": 38648888.0, "reward": 10.59997844696045, "reward_std": 36.29201889038086, "rewards/rollout_reward_func/mean": 10.59997844696045, "rewards/rollout_reward_func/std": 36.29201889038086, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.2822265625, "sampling/sampling_logp_difference/mean": 0.2545912265777588, "step": 1120, "step_time": 39.70877265397576 }, { "clip_ratio/high_max": 0.003479677892755717, "clip_ratio/high_mean": 0.003479677892755717, "clip_ratio/low_mean": 0.004104088438907638, "clip_ratio/low_min": 0.004104088438907638, "clip_ratio/region_mean": 0.007583766360767186, "completions/clipped_ratio": 0.0, "completions/max_length": 3130.0, "completions/max_terminated_length": 3130.0, "completions/mean_length": 2996.625, "completions/mean_terminated_length": 2996.625, "completions/min_length": 2855.0, "completions/min_terminated_length": 2855.0, "entropy": 0.027770803542807698, "epoch": 4.484000179360007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006031378172338009, "kl": 0.010546526696998626, "learning_rate": 7.462979993496705e-06, "loss": 0.0002, "num_tokens": 38709773.0, "reward": -6.945809364318848, "reward_std": 9.347344398498535, "rewards/rollout_reward_func/mean": -6.945809364318848, "rewards/rollout_reward_func/std": 9.347345352172852, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.826171875, "sampling/sampling_logp_difference/mean": 0.24587830901145935, "step": 1121, "step_time": 40.73746614897391 }, { "clip_ratio/high_max": 0.003471270902082324, "clip_ratio/high_mean": 0.003471270902082324, "clip_ratio/low_mean": 0.004574832622893155, "clip_ratio/low_min": 0.004574832622893155, "clip_ratio/region_mean": 0.008046103524975479, "completions/clipped_ratio": 0.0, "completions/max_length": 3125.0, "completions/max_terminated_length": 3125.0, "completions/mean_length": 2734.4375, "completions/mean_terminated_length": 2734.4375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.02934559155255556, "epoch": 4.4880001795200074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01012823823839426, "kl": 0.014738532365299761, "learning_rate": 7.462979993484713e-06, "loss": 0.0002, "num_tokens": 38766424.0, "reward": 6.811195373535156, "reward_std": 26.54619598388672, "rewards/rollout_reward_func/mean": 6.811195373535156, "rewards/rollout_reward_func/std": 26.54619789123535, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.8125, "sampling/sampling_logp_difference/mean": 0.2562222182750702, "step": 1122, "step_time": 39.618161594029516 }, { "clip_ratio/high_max": 0.002921727253124118, "clip_ratio/high_mean": 0.002921727253124118, "clip_ratio/low_mean": 0.00428974418900907, "clip_ratio/low_min": 0.00428974418900907, "clip_ratio/region_mean": 0.007211471500340849, "completions/clipped_ratio": 0.0, "completions/max_length": 2954.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 2788.4375, "completions/mean_terminated_length": 2788.4375, "completions/min_length": 2011.0, "completions/min_terminated_length": 2011.0, "entropy": 0.03046882268972695, "epoch": 4.492000179680007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01451784372329712, "kl": 0.012610023375600576, "learning_rate": 7.462979993472708e-06, "loss": 0.0002, "num_tokens": 38823914.0, "reward": 6.456388473510742, "reward_std": 22.822690963745117, "rewards/rollout_reward_func/mean": 6.456388473510742, "rewards/rollout_reward_func/std": 22.822690963745117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 41.12521743774414, "sampling/sampling_logp_difference/mean": 0.2562422752380371, "step": 1123, "step_time": 39.41898223600583 }, { "clip_ratio/high_max": 0.0035885782563127577, "clip_ratio/high_mean": 0.0035885782563127577, "clip_ratio/low_mean": 0.00450899446150288, "clip_ratio/low_min": 0.00450899446150288, "clip_ratio/region_mean": 0.008097572717815638, "completions/clipped_ratio": 0.0, "completions/max_length": 3097.0, "completions/max_terminated_length": 3097.0, "completions/mean_length": 2840.1875, "completions/mean_terminated_length": 2840.1875, "completions/min_length": 2070.0, "completions/min_terminated_length": 2070.0, "entropy": 0.029825575649738312, "epoch": 4.496000179840007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0053435941226780415, "kl": 0.012011007638648152, "learning_rate": 7.462979993460693e-06, "loss": 0.0002, "num_tokens": 38882261.0, "reward": 9.523713111877441, "reward_std": 22.988086700439453, "rewards/rollout_reward_func/mean": 9.523713111877441, "rewards/rollout_reward_func/std": 22.988086700439453, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.71875, "sampling/sampling_logp_difference/mean": 0.2533547878265381, "step": 1124, "step_time": 39.912803417028044 }, { "clip_ratio/high_max": 0.0037927180528640747, "clip_ratio/high_mean": 0.0037927180528640747, "clip_ratio/low_mean": 0.003770877985516563, "clip_ratio/low_min": 0.003770877985516563, "clip_ratio/region_mean": 0.007563596009276807, "completions/clipped_ratio": 0.0, "completions/max_length": 3135.0, "completions/max_terminated_length": 3135.0, "completions/mean_length": 2935.375, "completions/mean_terminated_length": 2935.375, "completions/min_length": 2669.0, "completions/min_terminated_length": 2669.0, "entropy": 0.029066693037748337, "epoch": 4.500000180000007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005905323661863804, "kl": 0.011580145452171564, "learning_rate": 7.462979993448667e-06, "loss": 0.0002, "num_tokens": 38942144.0, "reward": 7.988120079040527, "reward_std": 31.207700729370117, "rewards/rollout_reward_func/mean": 7.988120079040527, "rewards/rollout_reward_func/std": 31.207700729370117, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.78125, "sampling/sampling_logp_difference/mean": 0.24487370252609253, "step": 1125, "step_time": 40.49337315300363 }, { "clip_ratio/high_max": 0.004466904589207843, "clip_ratio/high_mean": 0.004466904589207843, "clip_ratio/low_mean": 0.00332331596291624, "clip_ratio/low_min": 0.00332331596291624, "clip_ratio/region_mean": 0.007790220610331744, "completions/clipped_ratio": 0.0, "completions/max_length": 3091.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 2943.0625, "completions/mean_terminated_length": 2943.0625, "completions/min_length": 2742.0, "completions/min_terminated_length": 2742.0, "entropy": 0.0291409392375499, "epoch": 4.504000180160007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004810179118067026, "kl": 0.014217922347597778, "learning_rate": 7.4629799934366295e-06, "loss": 0.0002, "num_tokens": 39002138.0, "reward": -3.716878890991211, "reward_std": 15.992414474487305, "rewards/rollout_reward_func/mean": -3.716878890991211, "rewards/rollout_reward_func/std": 15.992415428161621, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.46875, "sampling/sampling_logp_difference/mean": 0.24862684309482574, "step": 1126, "step_time": 40.34245525699225 }, { "clip_ratio/high_max": 0.003967901808209717, "clip_ratio/high_mean": 0.003967901808209717, "clip_ratio/low_mean": 0.004800987051567063, "clip_ratio/low_min": 0.004800987051567063, "clip_ratio/region_mean": 0.00876888883067295, "completions/clipped_ratio": 0.0, "completions/max_length": 3012.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 2786.3125, "completions/mean_terminated_length": 2786.3125, "completions/min_length": 1537.0, "completions/min_terminated_length": 1537.0, "entropy": 0.032398765441030264, "epoch": 4.5080001803200075e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.005490829702466726, "kl": 0.01325589488260448, "learning_rate": 7.4629799934245805e-06, "loss": 0.0002, "num_tokens": 39059606.0, "reward": -2.2369613647460938, "reward_std": 31.85081672668457, "rewards/rollout_reward_func/mean": -2.2369613647460938, "rewards/rollout_reward_func/std": 31.850818634033203, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.03125762939453, "sampling/sampling_logp_difference/mean": 0.25795525312423706, "step": 1127, "step_time": 38.95008507100283 }, { "clip_ratio/high_max": 0.003748721501324326, "clip_ratio/high_mean": 0.003748721501324326, "clip_ratio/low_mean": 0.003466326539637521, "clip_ratio/low_min": 0.003466326539637521, "clip_ratio/region_mean": 0.007215047953650355, "completions/clipped_ratio": 0.0, "completions/max_length": 2982.0, "completions/max_terminated_length": 2982.0, "completions/mean_length": 2859.1875, "completions/mean_terminated_length": 2859.1875, "completions/min_length": 2766.0, "completions/min_terminated_length": 2766.0, "entropy": 0.03065492701716721, "epoch": 4.512000180480007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003281892044469714, "kl": 0.012385132256895304, "learning_rate": 7.46297999341252e-06, "loss": 0.0002, "num_tokens": 39118234.0, "reward": -4.958804607391357, "reward_std": 11.839067459106445, "rewards/rollout_reward_func/mean": -4.958804607391357, "rewards/rollout_reward_func/std": 11.839067459106445, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.0625, "sampling/sampling_logp_difference/mean": 0.2552793323993683, "step": 1128, "step_time": 39.93150436297583 }, { "clip_ratio/high_max": 0.0036494527594186366, "clip_ratio/high_mean": 0.0036494527594186366, "clip_ratio/low_mean": 0.003790512098930776, "clip_ratio/low_min": 0.003790512098930776, "clip_ratio/region_mean": 0.007439964858349413, "completions/clipped_ratio": 0.0, "completions/max_length": 3097.0, "completions/max_terminated_length": 3097.0, "completions/mean_length": 2714.8125, "completions/mean_terminated_length": 2714.8125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.030306935543194413, "epoch": 4.516000180640007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0049546766094863415, "kl": 0.011558804893866181, "learning_rate": 7.46297999340045e-06, "loss": 0.0002, "num_tokens": 39174566.0, "reward": 1.4617977142333984, "reward_std": 25.604408264160156, "rewards/rollout_reward_func/mean": 1.4617977142333984, "rewards/rollout_reward_func/std": 25.604408264160156, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.31250762939453, "sampling/sampling_logp_difference/mean": 0.24826401472091675, "step": 1129, "step_time": 39.80885223999212 }, { "clip_ratio/high_max": 0.0045142283488530666, "clip_ratio/high_mean": 0.0045142283488530666, "clip_ratio/low_mean": 0.004436999501194805, "clip_ratio/low_min": 0.004436999501194805, "clip_ratio/region_mean": 0.008951227820944041, "completions/clipped_ratio": 0.0, "completions/max_length": 2993.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 2862.0625, "completions/mean_terminated_length": 2862.0625, "completions/min_length": 2749.0, "completions/min_terminated_length": 2749.0, "entropy": 0.029911794001236558, "epoch": 4.5200001808000074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0124532300978899, "kl": 0.01308522664476186, "learning_rate": 7.462979993388369e-06, "loss": 0.0002, "num_tokens": 39233246.0, "reward": -3.198392391204834, "reward_std": 11.42626667022705, "rewards/rollout_reward_func/mean": -3.198392391204834, "rewards/rollout_reward_func/std": 11.42626667022705, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.5625, "sampling/sampling_logp_difference/mean": 0.2530980110168457, "step": 1130, "step_time": 40.05758309000521 }, { "clip_ratio/high_max": 0.0031896930740913376, "clip_ratio/high_mean": 0.0031896930740913376, "clip_ratio/low_mean": 0.004356575023848563, "clip_ratio/low_min": 0.004356575023848563, "clip_ratio/region_mean": 0.007546268170699477, "completions/clipped_ratio": 0.0, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 2730.9375, "completions/mean_terminated_length": 2730.9375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.030174885876476765, "epoch": 4.524000180960007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0032514322083443403, "kl": 0.010535846464335918, "learning_rate": 7.462979993376277e-06, "loss": 0.0001, "num_tokens": 39289845.0, "reward": -0.056294918060302734, "reward_std": 27.401514053344727, "rewards/rollout_reward_func/mean": -0.056294918060302734, "rewards/rollout_reward_func/std": 27.401514053344727, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.203125, "sampling/sampling_logp_difference/mean": 0.2462097406387329, "step": 1131, "step_time": 39.35603146199719 }, { "clip_ratio/high_max": 0.004481931449845433, "clip_ratio/high_mean": 0.004481931449845433, "clip_ratio/low_mean": 0.0026147615863010287, "clip_ratio/low_min": 0.0026147615863010287, "clip_ratio/region_mean": 0.007096693094354123, "completions/clipped_ratio": 0.0, "completions/max_length": 3116.0, "completions/max_terminated_length": 3116.0, "completions/mean_length": 3002.125, "completions/mean_terminated_length": 3002.125, "completions/min_length": 2756.0, "completions/min_terminated_length": 2756.0, "entropy": 0.02737592370249331, "epoch": 4.5280001811200075e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004373793490231037, "kl": 0.010386579786427319, "learning_rate": 7.462979993364173e-06, "loss": 0.0002, "num_tokens": 39350819.0, "reward": -8.303312301635742, "reward_std": 8.868890762329102, "rewards/rollout_reward_func/mean": -8.303312301635742, "rewards/rollout_reward_func/std": 8.868891716003418, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.375, "sampling/sampling_logp_difference/mean": 0.2381032556295395, "step": 1132, "step_time": 40.89681404599105 }, { "clip_ratio/high_max": 0.003468631795840338, "clip_ratio/high_mean": 0.003468631795840338, "clip_ratio/low_mean": 0.004847346048336476, "clip_ratio/low_min": 0.004847346048336476, "clip_ratio/region_mean": 0.008315977815072984, "completions/clipped_ratio": 0.0, "completions/max_length": 3032.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 2865.625, "completions/mean_terminated_length": 2865.625, "completions/min_length": 2775.0, "completions/min_terminated_length": 2775.0, "entropy": 0.028257734375074506, "epoch": 4.532000181280007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004565223585814238, "kl": 0.011363932280801237, "learning_rate": 7.462979993352058e-06, "loss": 0.0002, "num_tokens": 39409551.0, "reward": -1.2572319507598877, "reward_std": 10.407297134399414, "rewards/rollout_reward_func/mean": -1.2572319507598877, "rewards/rollout_reward_func/std": 10.407297134399414, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.7734375, "sampling/sampling_logp_difference/mean": 0.2478354424238205, "step": 1133, "step_time": 39.72935596102616 }, { "clip_ratio/high_max": 0.002644268883159384, "clip_ratio/high_mean": 0.002644268883159384, "clip_ratio/low_mean": 0.005389810277847573, "clip_ratio/low_min": 0.005389810277847573, "clip_ratio/region_mean": 0.008034079044591635, "completions/clipped_ratio": 0.0, "completions/max_length": 3124.0, "completions/max_terminated_length": 3124.0, "completions/mean_length": 2864.375, "completions/mean_terminated_length": 2864.375, "completions/min_length": 2199.0, "completions/min_terminated_length": 2199.0, "entropy": 0.029427307890728116, "epoch": 4.536000181440007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004762208089232445, "kl": 0.01092692808015272, "learning_rate": 7.462979993339932e-06, "loss": 0.0002, "num_tokens": 39468288.0, "reward": 7.454254627227783, "reward_std": 17.072885513305664, "rewards/rollout_reward_func/mean": 7.454254627227783, "rewards/rollout_reward_func/std": 17.072887420654297, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.34375, "sampling/sampling_logp_difference/mean": 0.2513347566127777, "step": 1134, "step_time": 40.3479382309888 }, { "clip_ratio/high_max": 0.004066094697918743, "clip_ratio/high_mean": 0.004066094697918743, "clip_ratio/low_mean": 0.004937126359436661, "clip_ratio/low_min": 0.004937126359436661, "clip_ratio/region_mean": 0.009003221057355404, "completions/clipped_ratio": 0.0, "completions/max_length": 3129.0, "completions/max_terminated_length": 3129.0, "completions/mean_length": 2921.9375, "completions/mean_terminated_length": 2921.9375, "completions/min_length": 2797.0, "completions/min_terminated_length": 2797.0, "entropy": 0.030015479307621717, "epoch": 4.5400001816000074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035080520901829004, "kl": 0.01014067477080971, "learning_rate": 7.462979993327796e-06, "loss": 0.0001, "num_tokens": 39527939.0, "reward": -2.6167044639587402, "reward_std": 10.948838233947754, "rewards/rollout_reward_func/mean": -2.6167044639587402, "rewards/rollout_reward_func/std": 10.948838233947754, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.703125, "sampling/sampling_logp_difference/mean": 0.25527456402778625, "step": 1135, "step_time": 40.61046130399336 }, { "clip_ratio/high_max": 0.003830699250102043, "clip_ratio/high_mean": 0.003830699250102043, "clip_ratio/low_mean": 0.004250006139045581, "clip_ratio/low_min": 0.004250006139045581, "clip_ratio/region_mean": 0.008080705418251455, "completions/clipped_ratio": 0.0, "completions/max_length": 3145.0, "completions/max_terminated_length": 3145.0, "completions/mean_length": 2958.6875, "completions/mean_terminated_length": 2958.6875, "completions/min_length": 2792.0, "completions/min_terminated_length": 2792.0, "entropy": 0.027025610441341996, "epoch": 4.544000181760007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.006130774971097708, "kl": 0.01206868002191186, "learning_rate": 7.462979993315649e-06, "loss": 0.0002, "num_tokens": 39588194.0, "reward": -3.4846749305725098, "reward_std": 12.387081146240234, "rewards/rollout_reward_func/mean": -3.4846749305725098, "rewards/rollout_reward_func/std": 12.387081146240234, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.6875, "sampling/sampling_logp_difference/mean": 0.23764173686504364, "step": 1136, "step_time": 40.52185126300901 }, { "clip_ratio/high_max": 0.001700538421573583, "clip_ratio/high_mean": 0.001700538421573583, "clip_ratio/low_mean": 0.005334973073331639, "clip_ratio/low_min": 0.005334973073331639, "clip_ratio/region_mean": 0.0070355115458369255, "completions/clipped_ratio": 0.0, "completions/max_length": 3131.0, "completions/max_terminated_length": 3131.0, "completions/mean_length": 2833.0, "completions/mean_terminated_length": 2833.0, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.027989310445263982, "epoch": 4.5480001819200076e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.009886374697089195, "kl": 0.011292398790828884, "learning_rate": 7.4629799933034895e-06, "loss": 0.0002, "num_tokens": 39646453.0, "reward": 4.5803680419921875, "reward_std": 24.835058212280273, "rewards/rollout_reward_func/mean": 4.5803680419921875, "rewards/rollout_reward_func/std": 24.835058212280273, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.21875, "sampling/sampling_logp_difference/mean": 0.23819804191589355, "step": 1137, "step_time": 40.32175385201117 }, { "clip_ratio/high_max": 0.004552947357296944, "clip_ratio/high_mean": 0.004552947357296944, "clip_ratio/low_mean": 0.003092106431722641, "clip_ratio/low_min": 0.003092106431722641, "clip_ratio/region_mean": 0.007645053789019585, "completions/clipped_ratio": 0.0, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 2871.5, "completions/mean_terminated_length": 2871.5, "completions/min_length": 2747.0, "completions/min_terminated_length": 2747.0, "entropy": 0.029992833267897367, "epoch": 4.552000182080007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002600339474156499, "kl": 0.009254757198505104, "learning_rate": 7.462979993291319e-06, "loss": 0.0001, "num_tokens": 39705285.0, "reward": -3.556703567504883, "reward_std": 10.868206024169922, "rewards/rollout_reward_func/mean": -3.556703567504883, "rewards/rollout_reward_func/std": 10.868206977844238, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.375, "sampling/sampling_logp_difference/mean": 0.24728351831436157, "step": 1138, "step_time": 39.822469483013265 }, { "clip_ratio/high_max": 0.004495394532568753, "clip_ratio/high_mean": 0.004495394532568753, "clip_ratio/low_mean": 0.0031220346863847226, "clip_ratio/low_min": 0.0031220346863847226, "clip_ratio/region_mean": 0.007617429248057306, "completions/clipped_ratio": 0.0, "completions/max_length": 3115.0, "completions/max_terminated_length": 3115.0, "completions/mean_length": 2982.375, "completions/mean_terminated_length": 2982.375, "completions/min_length": 2854.0, "completions/min_terminated_length": 2854.0, "entropy": 0.026977488538250327, "epoch": 4.556000182240007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003988762851804495, "kl": 0.009641270968131721, "learning_rate": 7.462979993279138e-06, "loss": 0.0001, "num_tokens": 39765938.0, "reward": -1.3458229303359985, "reward_std": 14.436697006225586, "rewards/rollout_reward_func/mean": -1.3458229303359985, "rewards/rollout_reward_func/std": 14.436697006225586, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.25, "sampling/sampling_logp_difference/mean": 0.23892664909362793, "step": 1139, "step_time": 40.688422118037124 }, { "clip_ratio/high_max": 0.004754371097078547, "clip_ratio/high_mean": 0.004754371097078547, "clip_ratio/low_mean": 0.0038468815910164267, "clip_ratio/low_min": 0.0038468815910164267, "clip_ratio/region_mean": 0.008601252746302634, "completions/clipped_ratio": 0.0, "completions/max_length": 3090.0, "completions/max_terminated_length": 3090.0, "completions/mean_length": 2859.25, "completions/mean_terminated_length": 2859.25, "completions/min_length": 2539.0, "completions/min_terminated_length": 2539.0, "entropy": 0.031247781589627266, "epoch": 4.5600001824000075e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0044464608654379845, "kl": 0.010859393223654479, "learning_rate": 7.462979993266946e-06, "loss": 0.0002, "num_tokens": 39824566.0, "reward": 5.94436502456665, "reward_std": 24.376567840576172, "rewards/rollout_reward_func/mean": 5.94436502456665, "rewards/rollout_reward_func/std": 24.376567840576172, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.5234375, "sampling/sampling_logp_difference/mean": 0.2590312659740448, "step": 1140, "step_time": 40.30582688601862 }, { "clip_ratio/high_max": 0.00502129050437361, "clip_ratio/high_mean": 0.00502129050437361, "clip_ratio/low_mean": 0.003340890514664352, "clip_ratio/low_min": 0.003340890514664352, "clip_ratio/region_mean": 0.008362181019037962, "completions/clipped_ratio": 0.0, "completions/max_length": 2972.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 2825.8125, "completions/mean_terminated_length": 2825.8125, "completions/min_length": 2755.0, "completions/min_terminated_length": 2755.0, "entropy": 0.02927029551938176, "epoch": 4.564000182560007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0051966565661132336, "kl": 0.010974224540404975, "learning_rate": 7.462979993254744e-06, "loss": 0.0002, "num_tokens": 39882638.0, "reward": -3.332322597503662, "reward_std": 8.279090881347656, "rewards/rollout_reward_func/mean": -3.332322597503662, "rewards/rollout_reward_func/std": 8.279091835021973, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 54.1875, "sampling/sampling_logp_difference/mean": 0.2505646049976349, "step": 1141, "step_time": 39.85093061599764 }, { "clip_ratio/high_max": 0.0018848936742870137, "clip_ratio/high_mean": 0.0018848936742870137, "clip_ratio/low_mean": 0.005368250247556716, "clip_ratio/low_min": 0.005368250247556716, "clip_ratio/region_mean": 0.007253143936395645, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 2522.0, "completions/mean_terminated_length": 2522.0, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.030815803445875645, "epoch": 4.5680001827200076e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003712921403348446, "kl": 0.011228892544750124, "learning_rate": 7.462979993242529e-06, "loss": 0.0001, "num_tokens": 39935862.0, "reward": 7.39300012588501, "reward_std": 36.20309829711914, "rewards/rollout_reward_func/mean": 7.39300012588501, "rewards/rollout_reward_func/std": 36.203102111816406, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.78125, "sampling/sampling_logp_difference/mean": 0.2456284612417221, "step": 1142, "step_time": 37.127002981011174 }, { "clip_ratio/high_max": 0.0031760580895934254, "clip_ratio/high_mean": 0.0031760580895934254, "clip_ratio/low_mean": 0.004594007186824456, "clip_ratio/low_min": 0.004594007186824456, "clip_ratio/region_mean": 0.007770065276417881, "completions/clipped_ratio": 0.0, "completions/max_length": 3121.0, "completions/max_terminated_length": 3121.0, "completions/mean_length": 2926.25, "completions/mean_terminated_length": 2926.25, "completions/min_length": 2658.0, "completions/min_terminated_length": 2658.0, "entropy": 0.030251070857048035, "epoch": 4.5720001828800074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008894425816833973, "kl": 0.01120578357949853, "learning_rate": 7.462979993230304e-06, "loss": 0.0002, "num_tokens": 39995590.0, "reward": -4.586951732635498, "reward_std": 10.360692977905273, "rewards/rollout_reward_func/mean": -4.586951732635498, "rewards/rollout_reward_func/std": 10.360692977905273, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.875, "sampling/sampling_logp_difference/mean": 0.25252485275268555, "step": 1143, "step_time": 40.59115296899108 }, { "clip_ratio/high_max": 0.005192659504245967, "clip_ratio/high_mean": 0.005192659504245967, "clip_ratio/low_mean": 0.0032773078710306436, "clip_ratio/low_min": 0.0032773078710306436, "clip_ratio/region_mean": 0.00846996740438044, "completions/clipped_ratio": 0.0, "completions/max_length": 3137.0, "completions/max_terminated_length": 3137.0, "completions/mean_length": 2988.4375, "completions/mean_terminated_length": 2988.4375, "completions/min_length": 2809.0, "completions/min_terminated_length": 2809.0, "entropy": 0.027898499043658376, "epoch": 4.576000183040007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002661196980625391, "kl": 0.010630888398736715, "learning_rate": 7.462979993218067e-06, "loss": 0.0002, "num_tokens": 40056332.0, "reward": -8.239089012145996, "reward_std": 12.200251579284668, "rewards/rollout_reward_func/mean": -8.239089012145996, "rewards/rollout_reward_func/std": 12.200250625610352, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.266693115234375, "sampling/sampling_logp_difference/mean": 0.24144473671913147, "step": 1144, "step_time": 40.12135531398235 }, { "clip_ratio/high_max": 0.004658983729314059, "clip_ratio/high_mean": 0.004658983729314059, "clip_ratio/low_mean": 0.004497559275478125, "clip_ratio/low_min": 0.004497559275478125, "clip_ratio/region_mean": 0.009156542946584523, "completions/clipped_ratio": 0.0, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 2710.8125, "completions/mean_terminated_length": 2710.8125, "completions/min_length": 1292.0, "completions/min_terminated_length": 1292.0, "entropy": 0.03105781809426844, "epoch": 4.5800001832000076e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003167609916999936, "kl": 0.01033459702739492, "learning_rate": 7.462979993205821e-06, "loss": 0.0001, "num_tokens": 40112572.0, "reward": 14.356964111328125, "reward_std": 45.09008026123047, "rewards/rollout_reward_func/mean": 14.356964111328125, "rewards/rollout_reward_func/std": 45.09008026123047, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.953125, "sampling/sampling_logp_difference/mean": 0.2610906958580017, "step": 1145, "step_time": 38.851082786975894 }, { "clip_ratio/high_max": 0.002788592057186179, "clip_ratio/high_mean": 0.002788592057186179, "clip_ratio/low_mean": 0.005082543095340952, "clip_ratio/low_min": 0.005082543095340952, "clip_ratio/region_mean": 0.007871135021559894, "completions/clipped_ratio": 0.0, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 2707.375, "completions/mean_terminated_length": 2707.375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.02931172028183937, "epoch": 4.584000183360007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002540882211178541, "kl": 0.011543821659870446, "learning_rate": 7.462979993193562e-06, "loss": 0.0001, "num_tokens": 40168777.0, "reward": 6.2919230461120605, "reward_std": 27.18494415283203, "rewards/rollout_reward_func/mean": 6.2919230461120605, "rewards/rollout_reward_func/std": 27.1849422454834, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.0, "sampling/sampling_logp_difference/mean": 0.2520768344402313, "step": 1146, "step_time": 38.43729094398441 }, { "clip_ratio/high_max": 0.004793673870153725, "clip_ratio/high_mean": 0.004793673870153725, "clip_ratio/low_mean": 0.00385130982613191, "clip_ratio/low_min": 0.00385130982613191, "clip_ratio/region_mean": 0.008644983696285635, "completions/clipped_ratio": 0.0, "completions/max_length": 3094.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 2910.9375, "completions/mean_terminated_length": 2910.9375, "completions/min_length": 1669.0, "completions/min_terminated_length": 1669.0, "entropy": 0.028663334203884006, "epoch": 4.588000183520007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038019411731511354, "kl": 0.011038169264793396, "learning_rate": 7.462979993181293e-06, "loss": 0.0002, "num_tokens": 40228272.0, "reward": 7.810830116271973, "reward_std": 33.42228317260742, "rewards/rollout_reward_func/mean": 7.810830116271973, "rewards/rollout_reward_func/std": 33.42228317260742, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.65625, "sampling/sampling_logp_difference/mean": 0.24475394189357758, "step": 1147, "step_time": 40.39022591296816 }, { "clip_ratio/high_max": 0.003260575787862763, "clip_ratio/high_mean": 0.003260575787862763, "clip_ratio/low_mean": 0.004613851517206058, "clip_ratio/low_min": 0.004613851517206058, "clip_ratio/region_mean": 0.00787442730506882, "completions/clipped_ratio": 0.0, "completions/max_length": 3073.0, "completions/max_terminated_length": 3073.0, "completions/mean_length": 2898.75, "completions/mean_terminated_length": 2898.75, "completions/min_length": 2578.0, "completions/min_terminated_length": 2578.0, "entropy": 0.02997620007954538, "epoch": 4.5920001836800075e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013337080366909504, "kl": 0.012630662298761308, "learning_rate": 7.462979993169013e-06, "loss": 0.0002, "num_tokens": 40287545.0, "reward": 6.819404602050781, "reward_std": 33.99079513549805, "rewards/rollout_reward_func/mean": 6.819404602050781, "rewards/rollout_reward_func/std": 33.99079513549805, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.6875, "sampling/sampling_logp_difference/mean": 0.2554680109024048, "step": 1148, "step_time": 40.50682961200073 }, { "clip_ratio/high_max": 0.003438347688643262, "clip_ratio/high_mean": 0.003438347688643262, "clip_ratio/low_mean": 0.004396200194605626, "clip_ratio/low_min": 0.004396200194605626, "clip_ratio/region_mean": 0.007834547781385481, "completions/clipped_ratio": 0.0, "completions/max_length": 3094.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 2868.625, "completions/mean_terminated_length": 2868.625, "completions/min_length": 2225.0, "completions/min_terminated_length": 2225.0, "entropy": 0.03063533012755215, "epoch": 4.596000183840007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007721225265413523, "kl": 0.013317429344169796, "learning_rate": 7.462979993156721e-06, "loss": 0.0002, "num_tokens": 40346351.0, "reward": 6.345963478088379, "reward_std": 38.23789978027344, "rewards/rollout_reward_func/mean": 6.345963478088379, "rewards/rollout_reward_func/std": 38.23789978027344, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.625, "sampling/sampling_logp_difference/mean": 0.2551206350326538, "step": 1149, "step_time": 40.361112240017974 }, { "clip_ratio/high_max": 0.0043720368994399905, "clip_ratio/high_mean": 0.0043720368994399905, "clip_ratio/low_mean": 0.0032608692126814276, "clip_ratio/low_min": 0.0032608692126814276, "clip_ratio/region_mean": 0.007632906083017588, "completions/clipped_ratio": 0.0, "completions/max_length": 3115.0, "completions/max_terminated_length": 3115.0, "completions/mean_length": 2908.9375, "completions/mean_terminated_length": 2908.9375, "completions/min_length": 2762.0, "completions/min_terminated_length": 2762.0, "entropy": 0.02929530362598598, "epoch": 4.6000001840000076e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013253497891128063, "kl": 0.011856755008921027, "learning_rate": 7.4629799931444175e-06, "loss": 0.0002, "num_tokens": 40405803.0, "reward": 1.7607250213623047, "reward_std": 15.754293441772461, "rewards/rollout_reward_func/mean": 1.7607250213623047, "rewards/rollout_reward_func/std": 15.754293441772461, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 47.41415023803711, "sampling/sampling_logp_difference/mean": 0.2514311969280243, "step": 1150, "step_time": 40.50367258899496 }, { "clip_ratio/high_max": 0.004607453563949093, "clip_ratio/high_mean": 0.004607453563949093, "clip_ratio/low_mean": 0.0028757501568179578, "clip_ratio/low_min": 0.0028757501568179578, "clip_ratio/region_mean": 0.007483203778974712, "completions/clipped_ratio": 0.0, "completions/max_length": 3134.0, "completions/max_terminated_length": 3134.0, "completions/mean_length": 3008.75, "completions/mean_terminated_length": 3008.75, "completions/min_length": 2885.0, "completions/min_terminated_length": 2885.0, "entropy": 0.027717436430975795, "epoch": 4.6040001841600074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.022906947880983353, "kl": 0.01086330134421587, "learning_rate": 7.462979993132105e-06, "loss": 0.0002, "num_tokens": 40466877.0, "reward": -7.3199076652526855, "reward_std": 12.250916481018066, "rewards/rollout_reward_func/mean": -7.3199076652526855, "rewards/rollout_reward_func/std": 12.25091552734375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.234375, "sampling/sampling_logp_difference/mean": 0.23854205012321472, "step": 1151, "step_time": 40.9355768320238 }, { "clip_ratio/high_max": 0.0035130659234710038, "clip_ratio/high_mean": 0.0035130659234710038, "clip_ratio/low_mean": 0.004559829976642504, "clip_ratio/low_min": 0.004559829976642504, "clip_ratio/region_mean": 0.008072895871009678, "completions/clipped_ratio": 0.0, "completions/max_length": 3081.0, "completions/max_terminated_length": 3081.0, "completions/mean_length": 2821.875, "completions/mean_terminated_length": 2821.875, "completions/min_length": 2050.0, "completions/min_terminated_length": 2050.0, "entropy": 0.030069761211052537, "epoch": 4.608000184320007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010657713748514652, "kl": 0.01310994103550911, "learning_rate": 7.46297999311978e-06, "loss": 0.0002, "num_tokens": 40524923.0, "reward": 7.265925407409668, "reward_std": 35.219459533691406, "rewards/rollout_reward_func/mean": 7.265925407409668, "rewards/rollout_reward_func/std": 35.21946334838867, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 57.9921875, "sampling/sampling_logp_difference/mean": 0.25268006324768066, "step": 1152, "step_time": 40.16515934497875 }, { "clip_ratio/high_max": 0.0037266449362505227, "clip_ratio/high_mean": 0.0037266449362505227, "clip_ratio/low_mean": 0.004588584532029927, "clip_ratio/low_min": 0.004588584532029927, "clip_ratio/region_mean": 0.008315229439176619, "completions/clipped_ratio": 0.0, "completions/max_length": 2925.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 2828.625, "completions/mean_terminated_length": 2828.625, "completions/min_length": 2772.0, "completions/min_terminated_length": 2772.0, "entropy": 0.029311845311895013, "epoch": 4.6120001844800075e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0054502482526004314, "kl": 0.010892764898017049, "learning_rate": 7.4629799931074446e-06, "loss": 0.0002, "num_tokens": 40583050.0, "reward": 1.4828605651855469, "reward_std": 13.687607765197754, "rewards/rollout_reward_func/mean": 1.4828605651855469, "rewards/rollout_reward_func/std": 13.687607765197754, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.375, "sampling/sampling_logp_difference/mean": 0.254142701625824, "step": 1153, "step_time": 39.14239855398773 }, { "clip_ratio/high_max": 0.004130814573727548, "clip_ratio/high_mean": 0.004130814573727548, "clip_ratio/low_mean": 0.0031824267061892897, "clip_ratio/low_min": 0.0031824267061892897, "clip_ratio/region_mean": 0.007313241309020668, "completions/clipped_ratio": 0.0, "completions/max_length": 3086.0, "completions/max_terminated_length": 3086.0, "completions/mean_length": 3007.0625, "completions/mean_terminated_length": 3007.0625, "completions/min_length": 2892.0, "completions/min_terminated_length": 2892.0, "entropy": 0.0274879215285182, "epoch": 4.616000184640007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004334550816565752, "kl": 0.01045045821228996, "learning_rate": 7.462979993095097e-06, "loss": 0.0002, "num_tokens": 40644085.0, "reward": -0.009775102138519287, "reward_std": 12.17680549621582, "rewards/rollout_reward_func/mean": -0.009775102138519287, "rewards/rollout_reward_func/std": 12.17680549621582, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.59386444091797, "sampling/sampling_logp_difference/mean": 0.2418825626373291, "step": 1154, "step_time": 40.6092134250066 }, { "clip_ratio/high_max": 0.004569350276142359, "clip_ratio/high_mean": 0.004569350276142359, "clip_ratio/low_mean": 0.0035604301665443927, "clip_ratio/low_min": 0.0035604301665443927, "clip_ratio/region_mean": 0.008129780471790582, "completions/clipped_ratio": 0.0, "completions/max_length": 3002.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 2883.5625, "completions/mean_terminated_length": 2883.5625, "completions/min_length": 2781.0, "completions/min_terminated_length": 2781.0, "entropy": 0.028544431552290916, "epoch": 4.620000184800008e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036906199529767036, "kl": 0.011834462988190353, "learning_rate": 7.46297999308274e-06, "loss": 0.0002, "num_tokens": 40703111.0, "reward": -1.5353028774261475, "reward_std": 8.592306137084961, "rewards/rollout_reward_func/mean": -1.5353028774261475, "rewards/rollout_reward_func/std": 8.592306137084961, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.0, "sampling/sampling_logp_difference/mean": 0.25135716795921326, "step": 1155, "step_time": 39.87147620503674 }, { "clip_ratio/high_max": 0.0044003131333738565, "clip_ratio/high_mean": 0.0044003131333738565, "clip_ratio/low_mean": 0.0036714644520543516, "clip_ratio/low_min": 0.0036714644520543516, "clip_ratio/region_mean": 0.008071777585428208, "completions/clipped_ratio": 0.0, "completions/max_length": 3077.0, "completions/max_terminated_length": 3077.0, "completions/mean_length": 2798.75, "completions/mean_terminated_length": 2798.75, "completions/min_length": 1716.0, "completions/min_terminated_length": 1716.0, "entropy": 0.029275903711095452, "epoch": 4.6240001849600074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037243522237986326, "kl": 0.01138096721842885, "learning_rate": 7.462979993070372e-06, "loss": 0.0002, "num_tokens": 40760758.0, "reward": -1.0567803382873535, "reward_std": 29.631628036499023, "rewards/rollout_reward_func/mean": -1.0567803382873535, "rewards/rollout_reward_func/std": 29.63162612915039, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.75, "sampling/sampling_logp_difference/mean": 0.25216642022132874, "step": 1156, "step_time": 39.980955090984935 }, { "clip_ratio/high_max": 0.0031907888769637793, "clip_ratio/high_mean": 0.0031907888769637793, "clip_ratio/low_mean": 0.0038841864734422415, "clip_ratio/low_min": 0.0038841864734422415, "clip_ratio/region_mean": 0.007074975350406021, "completions/clipped_ratio": 0.0, "completions/max_length": 3149.0, "completions/max_terminated_length": 3149.0, "completions/mean_length": 2935.1875, "completions/mean_terminated_length": 2935.1875, "completions/min_length": 2474.0, "completions/min_terminated_length": 2474.0, "entropy": 0.030149550177156925, "epoch": 4.628000185120007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.020319871604442596, "kl": 0.013720512622967362, "learning_rate": 7.462979993057991e-06, "loss": 0.0002, "num_tokens": 40820661.0, "reward": 6.498544692993164, "reward_std": 27.408428192138672, "rewards/rollout_reward_func/mean": 6.498544692993164, "rewards/rollout_reward_func/std": 27.408428192138672, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.859375, "sampling/sampling_logp_difference/mean": 0.2540605664253235, "step": 1157, "step_time": 39.91486641598749 }, { "clip_ratio/high_max": 0.003299119067378342, "clip_ratio/high_mean": 0.003299119067378342, "clip_ratio/low_mean": 0.004185107653029263, "clip_ratio/low_min": 0.004185107653029263, "clip_ratio/region_mean": 0.007484226720407605, "completions/clipped_ratio": 0.0, "completions/max_length": 3079.0, "completions/max_terminated_length": 3079.0, "completions/mean_length": 2944.0625, "completions/mean_terminated_length": 2944.0625, "completions/min_length": 2196.0, "completions/min_terminated_length": 2196.0, "entropy": 0.029227406019344926, "epoch": 4.6320001852800076e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.017647547647356987, "kl": 0.01229812070960179, "learning_rate": 7.4629799930456e-06, "loss": 0.0002, "num_tokens": 40880694.0, "reward": 0.1764523983001709, "reward_std": 29.082921981811523, "rewards/rollout_reward_func/mean": 0.1764523983001709, "rewards/rollout_reward_func/std": 29.082921981811523, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 46.679691314697266, "sampling/sampling_logp_difference/mean": 0.2583094835281372, "step": 1158, "step_time": 44.505496575002326 }, { "clip_ratio/high_max": 0.003709060270921327, "clip_ratio/high_mean": 0.003709060270921327, "clip_ratio/low_mean": 0.0038665289466734976, "clip_ratio/low_min": 0.0038665289466734976, "clip_ratio/region_mean": 0.00757558923214674, "completions/clipped_ratio": 0.0, "completions/max_length": 3103.0, "completions/max_terminated_length": 3103.0, "completions/mean_length": 2794.875, "completions/mean_terminated_length": 2794.875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.029996360652148724, "epoch": 4.636000185440007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.003234527073800564, "kl": 0.011788391275331378, "learning_rate": 7.462979993033197e-06, "loss": 0.0002, "num_tokens": 40938341.0, "reward": 3.605942964553833, "reward_std": 23.678743362426758, "rewards/rollout_reward_func/mean": 3.605942964553833, "rewards/rollout_reward_func/std": 23.67874526977539, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 56.84375, "sampling/sampling_logp_difference/mean": 0.2448730766773224, "step": 1159, "step_time": 39.73189358605305 }, { "clip_ratio/high_max": 0.003878979798173532, "clip_ratio/high_mean": 0.003878979798173532, "clip_ratio/low_mean": 0.004094508301932365, "clip_ratio/low_min": 0.004094508301932365, "clip_ratio/region_mean": 0.007973488012794405, "completions/clipped_ratio": 0.0, "completions/max_length": 3132.0, "completions/max_terminated_length": 3132.0, "completions/mean_length": 2888.0, "completions/mean_terminated_length": 2888.0, "completions/min_length": 2553.0, "completions/min_terminated_length": 2553.0, "entropy": 0.029991596471518278, "epoch": 4.640000185600008e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.038240645080804825, "kl": 0.01475513755576685, "learning_rate": 7.4629799930207855e-06, "loss": 0.0002, "num_tokens": 40997452.0, "reward": 7.084255218505859, "reward_std": 17.743450164794922, "rewards/rollout_reward_func/mean": 7.084255218505859, "rewards/rollout_reward_func/std": 17.743450164794922, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.880859375, "sampling/sampling_logp_difference/mean": 0.24986393749713898, "step": 1160, "step_time": 40.43785224900057 }, { "clip_ratio/high_max": 0.004198702663416043, "clip_ratio/high_mean": 0.004198702663416043, "clip_ratio/low_mean": 0.003582235280191526, "clip_ratio/low_min": 0.003582235280191526, "clip_ratio/region_mean": 0.007780937885399908, "completions/clipped_ratio": 0.0, "completions/max_length": 3080.0, "completions/max_terminated_length": 3080.0, "completions/mean_length": 2928.1875, "completions/mean_terminated_length": 2928.1875, "completions/min_length": 2827.0, "completions/min_terminated_length": 2827.0, "entropy": 0.02781710377894342, "epoch": 4.6440001857600075e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0063477326184511185, "kl": 0.011148714576847851, "learning_rate": 7.46297999300836e-06, "loss": 0.0002, "num_tokens": 41057202.0, "reward": -2.8782012462615967, "reward_std": 12.041494369506836, "rewards/rollout_reward_func/mean": -2.8782012462615967, "rewards/rollout_reward_func/std": 12.041495323181152, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.6875, "sampling/sampling_logp_difference/mean": 0.2450190931558609, "step": 1161, "step_time": 40.366675427008886 }, { "clip_ratio/high_max": 0.00466437271097675, "clip_ratio/high_mean": 0.00466437271097675, "clip_ratio/low_mean": 0.00370272746658884, "clip_ratio/low_min": 0.00370272746658884, "clip_ratio/region_mean": 0.008367100090254098, "completions/clipped_ratio": 0.0, "completions/max_length": 3029.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 2850.875, "completions/mean_terminated_length": 2850.875, "completions/min_length": 2075.0, "completions/min_terminated_length": 2075.0, "entropy": 0.030414930544793606, "epoch": 4.648000185920007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028547768015414476, "kl": 0.011402334901504219, "learning_rate": 7.462979992995927e-06, "loss": 0.0002, "num_tokens": 41115700.0, "reward": 4.64592170715332, "reward_std": 37.75726318359375, "rewards/rollout_reward_func/mean": 4.64592170715332, "rewards/rollout_reward_func/std": 37.757266998291016, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.417972564697266, "sampling/sampling_logp_difference/mean": 0.2577439546585083, "step": 1162, "step_time": 39.397713564001606 }, { "clip_ratio/high_max": 0.0026730296667665243, "clip_ratio/high_mean": 0.0026730296667665243, "clip_ratio/low_mean": 0.005067674530437216, "clip_ratio/low_min": 0.005067674530437216, "clip_ratio/region_mean": 0.00774070416809991, "completions/clipped_ratio": 0.0, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 2896.5, "completions/mean_terminated_length": 2896.5, "completions/min_length": 2779.0, "completions/min_terminated_length": 2779.0, "entropy": 0.028780633816495538, "epoch": 4.6520001860800077e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002479827031493187, "kl": 0.009391087805852294, "learning_rate": 7.46297999298348e-06, "loss": 0.0001, "num_tokens": 41174950.0, "reward": 1.005632996559143, "reward_std": 13.24874210357666, "rewards/rollout_reward_func/mean": 1.005632996559143, "rewards/rollout_reward_func/std": 13.24874210357666, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.125, "sampling/sampling_logp_difference/mean": 0.24497412145137787, "step": 1163, "step_time": 40.0930759619805 }, { "clip_ratio/high_max": 0.003784249856835231, "clip_ratio/high_mean": 0.003784249856835231, "clip_ratio/low_mean": 0.004459824820514768, "clip_ratio/low_min": 0.004459824820514768, "clip_ratio/region_mean": 0.00824407470645383, "completions/clipped_ratio": 0.0, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 2668.4375, "completions/mean_terminated_length": 2668.4375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.029371678363531828, "epoch": 4.6560001862400074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.008823228068649769, "kl": 0.015066442545503378, "learning_rate": 7.462979992971022e-06, "loss": 0.0002, "num_tokens": 41230530.0, "reward": 8.39982795715332, "reward_std": 41.933406829833984, "rewards/rollout_reward_func/mean": 8.39982795715332, "rewards/rollout_reward_func/std": 41.933414459228516, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.9492301940918, "sampling/sampling_logp_difference/mean": 0.25840169191360474, "step": 1164, "step_time": 38.47099609898578 }, { "clip_ratio/high_max": 0.0036978074349462986, "clip_ratio/high_mean": 0.0036978074349462986, "clip_ratio/low_mean": 0.004784470394952223, "clip_ratio/low_min": 0.004784470394952223, "clip_ratio/region_mean": 0.00848227774258703, "completions/clipped_ratio": 0.0, "completions/max_length": 3159.0, "completions/max_terminated_length": 3159.0, "completions/mean_length": 2867.625, "completions/mean_terminated_length": 2867.625, "completions/min_length": 1611.0, "completions/min_terminated_length": 1611.0, "entropy": 0.031404058216139674, "epoch": 4.660000186400007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00669747032225132, "kl": 0.01275855116546154, "learning_rate": 7.462979992958554e-06, "loss": 0.0002, "num_tokens": 41289320.0, "reward": 10.272603034973145, "reward_std": 32.965576171875, "rewards/rollout_reward_func/mean": 10.272603034973145, "rewards/rollout_reward_func/std": 32.965576171875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.4375, "sampling/sampling_logp_difference/mean": 0.25558218359947205, "step": 1165, "step_time": 40.05500500999915 }, { "clip_ratio/high_max": 0.00263928686035797, "clip_ratio/high_mean": 0.00263928686035797, "clip_ratio/low_mean": 0.0050163538253400475, "clip_ratio/low_min": 0.0050163538253400475, "clip_ratio/region_mean": 0.007655640598386526, "completions/clipped_ratio": 0.0, "completions/max_length": 3128.0, "completions/max_terminated_length": 3128.0, "completions/mean_length": 2969.0625, "completions/mean_terminated_length": 2969.0625, "completions/min_length": 2724.0, "completions/min_terminated_length": 2724.0, "entropy": 0.027685942128300667, "epoch": 4.6640001865600076e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.007109588012099266, "kl": 0.01160511007765308, "learning_rate": 7.462979992946075e-06, "loss": 0.0002, "num_tokens": 41349754.0, "reward": -1.2743542194366455, "reward_std": 17.08123779296875, "rewards/rollout_reward_func/mean": -1.2743542194366455, "rewards/rollout_reward_func/std": 17.08123779296875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.974613189697266, "sampling/sampling_logp_difference/mean": 0.2418103963136673, "step": 1166, "step_time": 40.87831059501332 }, { "clip_ratio/high_max": 0.003492521937005222, "clip_ratio/high_mean": 0.003492521937005222, "clip_ratio/low_mean": 0.00451277761021629, "clip_ratio/low_min": 0.00451277761021629, "clip_ratio/region_mean": 0.008005299663636833, "completions/clipped_ratio": 0.0, "completions/max_length": 3083.0, "completions/max_terminated_length": 3083.0, "completions/mean_length": 2923.625, "completions/mean_terminated_length": 2923.625, "completions/min_length": 2767.0, "completions/min_terminated_length": 2767.0, "entropy": 0.029201299883425236, "epoch": 4.668000186720007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004079150967299938, "kl": 0.01098755409475416, "learning_rate": 7.462979992933585e-06, "loss": 0.0002, "num_tokens": 41409436.0, "reward": -8.918243408203125, "reward_std": 12.422784805297852, "rewards/rollout_reward_func/mean": -8.918243408203125, "rewards/rollout_reward_func/std": 12.422784805297852, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 49.35938262939453, "sampling/sampling_logp_difference/mean": 0.24816495180130005, "step": 1167, "step_time": 40.630630613988615 }, { "clip_ratio/high_max": 0.003021705138962716, "clip_ratio/high_mean": 0.003021705138962716, "clip_ratio/low_mean": 0.00501624756725505, "clip_ratio/low_min": 0.00501624756725505, "clip_ratio/region_mean": 0.008037952706217766, "completions/clipped_ratio": 0.0, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 2655.4375, "completions/mean_terminated_length": 2655.4375, "completions/min_length": 1695.0, "completions/min_terminated_length": 1695.0, "entropy": 0.03164428239688277, "epoch": 4.672000186880008e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028937107417732477, "kl": 0.010533285851124674, "learning_rate": 7.462979992921084e-06, "loss": 0.0001, "num_tokens": 41464802.0, "reward": 23.729843139648438, "reward_std": 37.17552947998047, "rewards/rollout_reward_func/mean": 23.729843139648438, "rewards/rollout_reward_func/std": 37.175533294677734, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.851566314697266, "sampling/sampling_logp_difference/mean": 0.26933687925338745, "step": 1168, "step_time": 38.765353027993115 }, { "clip_ratio/high_max": 0.004255859996192157, "clip_ratio/high_mean": 0.004255859996192157, "clip_ratio/low_mean": 0.003795878932578489, "clip_ratio/low_min": 0.003795878932578489, "clip_ratio/region_mean": 0.008051738957874477, "completions/clipped_ratio": 0.0, "completions/max_length": 2903.0, "completions/max_terminated_length": 2903.0, "completions/mean_length": 2823.5625, "completions/mean_terminated_length": 2823.5625, "completions/min_length": 2730.0, "completions/min_terminated_length": 2730.0, "entropy": 0.030581956263631582, "epoch": 4.6760001870400075e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010757919400930405, "kl": 0.012661864922847599, "learning_rate": 7.4629799929085705e-06, "loss": 0.0002, "num_tokens": 41522860.0, "reward": -3.031346321105957, "reward_std": 9.89370346069336, "rewards/rollout_reward_func/mean": -3.031346321105957, "rewards/rollout_reward_func/std": 9.89370346069336, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.59375, "sampling/sampling_logp_difference/mean": 0.25499585270881653, "step": 1169, "step_time": 38.72327317202871 }, { "clip_ratio/high_max": 0.003336815076181665, "clip_ratio/high_mean": 0.003336815076181665, "clip_ratio/low_mean": 0.004186377773294225, "clip_ratio/low_min": 0.004186377773294225, "clip_ratio/region_mean": 0.007523192907683551, "completions/clipped_ratio": 0.0, "completions/max_length": 3068.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 2883.875, "completions/mean_terminated_length": 2883.875, "completions/min_length": 2751.0, "completions/min_terminated_length": 2751.0, "entropy": 0.029217303963378072, "epoch": 4.680000187200007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028399464208632708, "kl": 0.011006191314663738, "learning_rate": 7.462979992896047e-06, "loss": 0.0002, "num_tokens": 41581891.0, "reward": -0.7571680545806885, "reward_std": 11.923553466796875, "rewards/rollout_reward_func/mean": -0.7571680545806885, "rewards/rollout_reward_func/std": 11.923553466796875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 48.71875, "sampling/sampling_logp_difference/mean": 0.24939872324466705, "step": 1170, "step_time": 39.89605833297537 }, { "clip_ratio/high_max": 0.003201918414561078, "clip_ratio/high_mean": 0.003201918414561078, "clip_ratio/low_mean": 0.004873196128755808, "clip_ratio/low_min": 0.004873196128755808, "clip_ratio/region_mean": 0.008075114514213055, "completions/clipped_ratio": 0.0, "completions/max_length": 3107.0, "completions/max_terminated_length": 3107.0, "completions/mean_length": 2736.75, "completions/mean_terminated_length": 2736.75, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.029176110168918967, "epoch": 4.6840001873600076e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.016189631074666977, "kl": 0.013515510247088969, "learning_rate": 7.462979992883514e-06, "loss": 0.0002, "num_tokens": 41638591.0, "reward": 8.688665390014648, "reward_std": 38.75834274291992, "rewards/rollout_reward_func/mean": 8.688665390014648, "rewards/rollout_reward_func/std": 38.75834274291992, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 55.09375, "sampling/sampling_logp_difference/mean": 0.2487814575433731, "step": 1171, "step_time": 39.52850392500113 }, { "clip_ratio/high_max": 0.0024956036068033427, "clip_ratio/high_mean": 0.0024956036068033427, "clip_ratio/low_mean": 0.0046876985870767385, "clip_ratio/low_min": 0.0046876985870767385, "clip_ratio/region_mean": 0.007183302310295403, "completions/clipped_ratio": 0.0, "completions/max_length": 3093.0, "completions/max_terminated_length": 3093.0, "completions/mean_length": 2931.1875, "completions/mean_terminated_length": 2931.1875, "completions/min_length": 2786.0, "completions/min_terminated_length": 2786.0, "entropy": 0.02738692844286561, "epoch": 4.6880001875200074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00556230079382658, "kl": 0.010441187012474984, "learning_rate": 7.462979992870967e-06, "loss": 0.0001, "num_tokens": 41698394.0, "reward": -1.963935375213623, "reward_std": 9.404509544372559, "rewards/rollout_reward_func/mean": -1.963935375213623, "rewards/rollout_reward_func/std": 9.404509544372559, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.271121978759766, "sampling/sampling_logp_difference/mean": 0.24227985739707947, "step": 1172, "step_time": 40.387446843014914 }, { "clip_ratio/high_max": 0.0046785916783846915, "clip_ratio/high_mean": 0.0046785916783846915, "clip_ratio/low_mean": 0.0033757739583961666, "clip_ratio/low_min": 0.0033757739583961666, "clip_ratio/region_mean": 0.008054365578573197, "completions/clipped_ratio": 0.0, "completions/max_length": 2986.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 2890.9375, "completions/mean_terminated_length": 2890.9375, "completions/min_length": 2769.0, "completions/min_terminated_length": 2769.0, "entropy": 0.029864440904930234, "epoch": 4.692000187680008e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0088054109364748, "kl": 0.012289002537727356, "learning_rate": 7.46297999285841e-06, "loss": 0.0002, "num_tokens": 41757536.0, "reward": -1.9373100996017456, "reward_std": 17.843048095703125, "rewards/rollout_reward_func/mean": -1.9373100996017456, "rewards/rollout_reward_func/std": 17.843048095703125, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 44.09375, "sampling/sampling_logp_difference/mean": 0.2502417266368866, "step": 1173, "step_time": 39.62321156800317 }, { "clip_ratio/high_max": 0.004310344811528921, "clip_ratio/high_mean": 0.004310344811528921, "clip_ratio/low_mean": 0.004355633514933288, "clip_ratio/low_min": 0.004355633514933288, "clip_ratio/region_mean": 0.008665978268254548, "completions/clipped_ratio": 0.0, "completions/max_length": 3127.0, "completions/max_terminated_length": 3127.0, "completions/mean_length": 2922.5625, "completions/mean_terminated_length": 2922.5625, "completions/min_length": 2197.0, "completions/min_terminated_length": 2197.0, "entropy": 0.028897831216454506, "epoch": 4.6960001878400075e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.00810918863862753, "kl": 0.010544135642703623, "learning_rate": 7.462979992845843e-06, "loss": 0.0001, "num_tokens": 41817219.0, "reward": 1.8124690055847168, "reward_std": 35.54989242553711, "rewards/rollout_reward_func/mean": 1.8124690055847168, "rewards/rollout_reward_func/std": 35.549896240234375, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 51.625, "sampling/sampling_logp_difference/mean": 0.24860644340515137, "step": 1174, "step_time": 40.12509034400864 }, { "clip_ratio/high_max": 0.004372449882794172, "clip_ratio/high_mean": 0.004372449882794172, "clip_ratio/low_mean": 0.0040391829097643495, "clip_ratio/low_min": 0.0040391829097643495, "clip_ratio/region_mean": 0.008411632792558521, "completions/clipped_ratio": 0.0, "completions/max_length": 2962.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 2875.1875, "completions/mean_terminated_length": 2875.1875, "completions/min_length": 2571.0, "completions/min_terminated_length": 2571.0, "entropy": 0.03165979287587106, "epoch": 4.700000188000007e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.002971163485199213, "kl": 0.009898713266011328, "learning_rate": 7.462979992833264e-06, "loss": 0.0001, "num_tokens": 41876121.0, "reward": 3.257796287536621, "reward_std": 22.754371643066406, "rewards/rollout_reward_func/mean": 3.257796287536621, "rewards/rollout_reward_func/std": 22.75437355041504, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.828125, "sampling/sampling_logp_difference/mean": 0.2540276050567627, "step": 1175, "step_time": 39.43917061500542 }, { "clip_ratio/high_max": 0.0051336986653041095, "clip_ratio/high_mean": 0.0051336986653041095, "clip_ratio/low_mean": 0.0024597029405413195, "clip_ratio/low_min": 0.0024597029405413195, "clip_ratio/region_mean": 0.007593401707708836, "completions/clipped_ratio": 0.0, "completions/max_length": 3095.0, "completions/max_terminated_length": 3095.0, "completions/mean_length": 2926.3125, "completions/mean_terminated_length": 2926.3125, "completions/min_length": 2774.0, "completions/min_terminated_length": 2774.0, "entropy": 0.02900980832055211, "epoch": 4.704000188160008e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037605820689350367, "kl": 0.011238236678764224, "learning_rate": 7.462979992820674e-06, "loss": 0.0002, "num_tokens": 41935855.0, "reward": -6.729710102081299, "reward_std": 11.661005973815918, "rewards/rollout_reward_func/mean": -6.729710102081299, "rewards/rollout_reward_func/std": 11.661006927490234, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 50.0625, "sampling/sampling_logp_difference/mean": 0.24832940101623535, "step": 1176, "step_time": 40.69254967100278 }, { "clip_ratio/high_max": 0.003690052093588747, "clip_ratio/high_mean": 0.003690052093588747, "clip_ratio/low_mean": 0.0035397146712057292, "clip_ratio/low_min": 0.0035397146712057292, "clip_ratio/region_mean": 0.007229766808450222, "completions/clipped_ratio": 0.0, "completions/max_length": 3117.0, "completions/max_terminated_length": 3117.0, "completions/mean_length": 2766.5625, "completions/mean_terminated_length": 2766.5625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.031842791475355625, "epoch": 4.7080001883200074e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004396804608404636, "kl": 0.010733336675912142, "learning_rate": 7.4629799928080735e-06, "loss": 0.0001, "num_tokens": 41993032.0, "reward": -7.383099555969238, "reward_std": 30.228303909301758, "rewards/rollout_reward_func/mean": -7.383099555969238, "rewards/rollout_reward_func/std": 30.22830581665039, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 45.375, "sampling/sampling_logp_difference/mean": 0.2450803816318512, "step": 1177, "step_time": 39.92715589202999 }, { "clip_ratio/high_max": 0.003895459754858166, "clip_ratio/high_mean": 0.003895459754858166, "clip_ratio/low_mean": 0.004138125921599567, "clip_ratio/low_min": 0.004138125921599567, "clip_ratio/region_mean": 0.008033585734665394, "completions/clipped_ratio": 0.0, "completions/max_length": 3007.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 2784.3125, "completions/mean_terminated_length": 2784.3125, "completions/min_length": 1838.0, "completions/min_terminated_length": 1838.0, "entropy": 0.03320229961536825, "epoch": 4.712000188480008e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026397991459816694, "kl": 0.009752998303156346, "learning_rate": 7.462979992795462e-06, "loss": 0.0001, "num_tokens": 42050464.0, "reward": 9.546480178833008, "reward_std": 38.9598388671875, "rewards/rollout_reward_func/mean": 9.546480178833008, "rewards/rollout_reward_func/std": 38.9598388671875, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 52.6875, "sampling/sampling_logp_difference/mean": 0.2748919725418091, "step": 1178, "step_time": 39.842811187001644 }, { "clip_ratio/high_max": 0.004462939745280892, "clip_ratio/high_mean": 0.004462939745280892, "clip_ratio/low_mean": 0.004081754392245784, "clip_ratio/low_min": 0.004081754392245784, "clip_ratio/region_mean": 0.008544694166630507, "completions/clipped_ratio": 0.0, "completions/max_length": 3058.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 2937.625, "completions/mean_terminated_length": 2937.625, "completions/min_length": 2839.0, "completions/min_terminated_length": 2839.0, "entropy": 0.02986436290666461, "epoch": 4.7160001886400076e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0028842284809798002, "kl": 0.010258924856316298, "learning_rate": 7.462979992782839e-06, "loss": 0.0001, "num_tokens": 42110377.0, "reward": 3.8619601726531982, "reward_std": 20.093698501586914, "rewards/rollout_reward_func/mean": 3.8619601726531982, "rewards/rollout_reward_func/std": 20.093698501586914, "sampling/importance_sampling_ratio/max": 0.0, "sampling/importance_sampling_ratio/mean": 0.0, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 53.3671875, "sampling/sampling_logp_difference/mean": 0.2473163604736328, "step": 1179, "step_time": 39.66258531999483 } ], "logging_steps": 1.0, "max_steps": 49999998, "num_input_tokens_seen": 42110377, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }