{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 179.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0008, "grad_norm": 5.598288059234619, "kl": 0.0005154609680175781, "learning_rate": 1.5873015873015872e-08, "loss": 0.0537, "num_tokens": 15100.0, "reward": 0.04846250265836716, "reward_std": 0.06843117624521255, "rewards/bleu_reward_func/mean": 0.04846250265836716, "rewards/bleu_reward_func/std": 0.07639143615961075, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 248.09375, "completions/mean_terminated_length": 128.13636779785156, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0016, "grad_norm": 7.323095321655273, "kl": 0.0005979537963867188, "learning_rate": 3.1746031746031744e-08, "loss": 0.2393, "num_tokens": 31479.0, "reward": 0.03515050560235977, "reward_std": 0.0315697155892849, "rewards/bleu_reward_func/mean": 0.03515050560235977, "rewards/bleu_reward_func/std": 0.048244670033454895, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 258.34375, "completions/mean_terminated_length": 159.0869598388672, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0024, "grad_norm": 5.801818370819092, "kl": 0.0008335113525390625, "learning_rate": 4.7619047619047613e-08, "loss": 0.2227, "num_tokens": 47330.0, "reward": 0.0770750418305397, "reward_std": 0.05211775749921799, "rewards/bleu_reward_func/mean": 0.0770750418305397, "rewards/bleu_reward_func/std": 0.07082299888134003, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 285.59375, "completions/mean_terminated_length": 197.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0032, "grad_norm": 6.7342329025268555, "kl": 0.0007953643798828125, "learning_rate": 6.349206349206349e-08, "loss": 0.1714, "num_tokens": 62101.0, "reward": 0.05630416050553322, "reward_std": 0.0387054979801178, "rewards/bleu_reward_func/mean": 0.05630416050553322, "rewards/bleu_reward_func/std": 0.05173136293888092, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 312.75, "completions/mean_terminated_length": 208.38095092773438, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.004, "grad_norm": 4.261541843414307, "kl": 0.0007123947143554688, "learning_rate": 7.936507936507936e-08, "loss": 0.0096, "num_tokens": 74629.0, "reward": 0.03661263734102249, "reward_std": 0.02765350043773651, "rewards/bleu_reward_func/mean": 0.03661263734102249, "rewards/bleu_reward_func/std": 0.05122661218047142, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 269.0625, "completions/mean_terminated_length": 80.11111450195312, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0048, "grad_norm": 25.550201416015625, "kl": 0.000881195068359375, "learning_rate": 9.523809523809523e-08, "loss": -0.1788, "num_tokens": 91711.0, "reward": 0.01917407289147377, "reward_std": 0.014019257389008999, "rewards/bleu_reward_func/mean": 0.01917407289147377, "rewards/bleu_reward_func/std": 0.024173468351364136, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 377.8125, "completions/mean_terminated_length": 259.4117736816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0056, "grad_norm": 3.7816717624664307, "kl": 0.0007390975952148438, "learning_rate": 1.111111111111111e-07, "loss": -0.2289, "num_tokens": 107369.0, "reward": 0.02209433726966381, "reward_std": 0.011734157800674438, "rewards/bleu_reward_func/mean": 0.02209433726966381, "rewards/bleu_reward_func/std": 0.023080473765730858, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 283.5625, "completions/mean_terminated_length": 146.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0064, "grad_norm": 4.340329647064209, "kl": 0.000911712646484375, "learning_rate": 1.2698412698412698e-07, "loss": -0.0252, "num_tokens": 125275.0, "reward": 0.03392016887664795, "reward_std": 0.04013249650597572, "rewards/bleu_reward_func/mean": 0.03392016887664795, "rewards/bleu_reward_func/std": 0.05353143438696861, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 282.09375, "completions/mean_terminated_length": 192.13043212890625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0072, "grad_norm": 5.671853542327881, "kl": 0.000640869140625, "learning_rate": 1.4285714285714285e-07, "loss": -0.4792, "num_tokens": 142190.0, "reward": 0.02354184165596962, "reward_std": 0.015565130859613419, "rewards/bleu_reward_func/mean": 0.02354184165596962, "rewards/bleu_reward_func/std": 0.02305246703326702, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 421.625, "completions/mean_terminated_length": 359.78948974609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.008, "grad_norm": 3.240866184234619, "kl": 0.0006823539733886719, "learning_rate": 1.5873015873015872e-07, "loss": -0.0021, "num_tokens": 158282.0, "reward": 0.02482026070356369, "reward_std": 0.0131409652531147, "rewards/bleu_reward_func/mean": 0.02482026070356369, "rewards/bleu_reward_func/std": 0.015270248055458069, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 345.90625, "completions/mean_terminated_length": 199.35293579101562, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0088, "grad_norm": 3.652275800704956, "kl": 0.0006260871887207031, "learning_rate": 1.7460317460317458e-07, "loss": -0.2852, "num_tokens": 177455.0, "reward": 0.03390186280012131, "reward_std": 0.016770539805293083, "rewards/bleu_reward_func/mean": 0.03390186280012131, "rewards/bleu_reward_func/std": 0.04328485205769539, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 275.46875, "completions/mean_terminated_length": 196.625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0096, "grad_norm": 73.21807098388672, "kl": 0.0032701492309570312, "learning_rate": 1.9047619047619045e-07, "loss": 0.0661, "num_tokens": 189486.0, "reward": 0.022345196455717087, "reward_std": 0.019753258675336838, "rewards/bleu_reward_func/mean": 0.022345196455717087, "rewards/bleu_reward_func/std": 0.020975911989808083, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 309.90625, "completions/mean_terminated_length": 188.65000915527344, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0104, "grad_norm": 5.3057379722595215, "kl": 0.0005865097045898438, "learning_rate": 2.0634920634920632e-07, "loss": -0.1972, "num_tokens": 203691.0, "reward": 0.031099505722522736, "reward_std": 0.04415294528007507, "rewards/bleu_reward_func/mean": 0.031099505722522736, "rewards/bleu_reward_func/std": 0.05319083109498024, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 211.5625, "completions/mean_terminated_length": 127.43999481201172, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0112, "grad_norm": 10.997786521911621, "kl": 0.0007891654968261719, "learning_rate": 2.222222222222222e-07, "loss": 0.005, "num_tokens": 220117.0, "reward": 0.07334433495998383, "reward_std": 0.05255947634577751, "rewards/bleu_reward_func/mean": 0.07334433495998383, "rewards/bleu_reward_func/std": 0.11127088218927383, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 322.71875, "completions/mean_terminated_length": 175.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.012, "grad_norm": 4.558916091918945, "kl": 0.000732421875, "learning_rate": 2.3809523809523806e-07, "loss": -0.1845, "num_tokens": 232508.0, "reward": 0.01538888644427061, "reward_std": 0.012768322601914406, "rewards/bleu_reward_func/mean": 0.01538888644427061, "rewards/bleu_reward_func/std": 0.01415330171585083, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 266.78125, "completions/mean_terminated_length": 138.33334350585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0128, "grad_norm": 4.418691158294678, "kl": 0.0009031295776367188, "learning_rate": 2.5396825396825396e-07, "loss": 0.2931, "num_tokens": 246325.0, "reward": 0.04519380256533623, "reward_std": 0.047629594802856445, "rewards/bleu_reward_func/mean": 0.04519380256533623, "rewards/bleu_reward_func/std": 0.09796681255102158, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 375.375, "completions/mean_terminated_length": 238.75, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0136, "grad_norm": 4.360379219055176, "kl": 0.0007829666137695312, "learning_rate": 2.698412698412698e-07, "loss": 0.0392, "num_tokens": 262393.0, "reward": 0.02785748988389969, "reward_std": 0.02370397374033928, "rewards/bleu_reward_func/mean": 0.02785748988389969, "rewards/bleu_reward_func/std": 0.031648874282836914, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 341.625, "completions/mean_terminated_length": 171.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0144, "grad_norm": 4.028530597686768, "kl": 0.0006041526794433594, "learning_rate": 2.857142857142857e-07, "loss": -0.0867, "num_tokens": 276509.0, "reward": 0.03313319757580757, "reward_std": 0.026780985295772552, "rewards/bleu_reward_func/mean": 0.03313319757580757, "rewards/bleu_reward_func/std": 0.03177988529205322, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 323.78125, "completions/mean_terminated_length": 250.13043212890625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0152, "grad_norm": 6.029330253601074, "kl": 0.0007543563842773438, "learning_rate": 3.0158730158730156e-07, "loss": 0.2177, "num_tokens": 288774.0, "reward": 0.04934918135404587, "reward_std": 0.035659849643707275, "rewards/bleu_reward_func/mean": 0.04934918135404587, "rewards/bleu_reward_func/std": 0.046043358743190765, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 365.5625, "completions/mean_terminated_length": 251.6666717529297, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.016, "grad_norm": 3.2459499835968018, "kl": 0.0007076263427734375, "learning_rate": 3.1746031746031743e-07, "loss": -0.1034, "num_tokens": 302384.0, "reward": 0.045273810625076294, "reward_std": 0.033148057758808136, "rewards/bleu_reward_func/mean": 0.045273810625076294, "rewards/bleu_reward_func/std": 0.05641715228557587, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 309.4375, "completions/mean_terminated_length": 170.84210205078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0168, "grad_norm": 3.209543228149414, "kl": 0.0006561279296875, "learning_rate": 3.333333333333333e-07, "loss": -0.0094, "num_tokens": 317406.0, "reward": 0.10972930490970612, "reward_std": 0.09467534720897675, "rewards/bleu_reward_func/mean": 0.10972930490970612, "rewards/bleu_reward_func/std": 0.1834246814250946, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 276.4375, "completions/mean_terminated_length": 169.3636474609375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.0176, "grad_norm": 6.837025165557861, "kl": 0.0008249282836914062, "learning_rate": 3.4920634920634917e-07, "loss": 0.192, "num_tokens": 331436.0, "reward": 0.08987575769424438, "reward_std": 0.03435216099023819, "rewards/bleu_reward_func/mean": 0.08987575769424438, "rewards/bleu_reward_func/std": 0.13043095171451569, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 392.46875, "completions/mean_terminated_length": 272.9375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.0184, "grad_norm": 6.737916946411133, "kl": 0.00086212158203125, "learning_rate": 3.6507936507936504e-07, "loss": -0.0441, "num_tokens": 349715.0, "reward": 0.027110569179058075, "reward_std": 0.01938316598534584, "rewards/bleu_reward_func/mean": 0.027110569179058075, "rewards/bleu_reward_func/std": 0.021934401243925095, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 274.65625, "completions/mean_terminated_length": 230.70370483398438, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0192, "grad_norm": 10.491765022277832, "kl": 0.0007648468017578125, "learning_rate": 3.809523809523809e-07, "loss": 0.269, "num_tokens": 360336.0, "reward": 0.03281049802899361, "reward_std": 0.023013217374682426, "rewards/bleu_reward_func/mean": 0.03281049802899361, "rewards/bleu_reward_func/std": 0.026025522500276566, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 103.7368392944336, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.02, "grad_norm": 5.670685291290283, "kl": 0.0008592605590820312, "learning_rate": 3.968253968253968e-07, "loss": 0.2917, "num_tokens": 374179.0, "reward": 0.04281582683324814, "reward_std": 0.0440773144364357, "rewards/bleu_reward_func/mean": 0.04281582683324814, "rewards/bleu_reward_func/std": 0.0797559842467308, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 296.96875, "completions/mean_terminated_length": 184.33334350585938, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0208, "grad_norm": 6.63213586807251, "kl": 0.0011281967163085938, "learning_rate": 4.1269841269841265e-07, "loss": 0.0991, "num_tokens": 386458.0, "reward": 0.07768785208463669, "reward_std": 0.08760131150484085, "rewards/bleu_reward_func/mean": 0.07768785208463669, "rewards/bleu_reward_func/std": 0.12583571672439575, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 280.4375, "completions/mean_terminated_length": 175.18182373046875, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.0216, "grad_norm": 5.1314802169799805, "kl": 0.0008611679077148438, "learning_rate": 4.285714285714285e-07, "loss": 0.2129, "num_tokens": 399600.0, "reward": 0.034803349524736404, "reward_std": 0.033125463873147964, "rewards/bleu_reward_func/mean": 0.034803349524736404, "rewards/bleu_reward_func/std": 0.04297792166471481, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 403.625, "completions/mean_terminated_length": 196.72727966308594, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0224, "grad_norm": 2.8215885162353516, "kl": 0.0007638931274414062, "learning_rate": 4.444444444444444e-07, "loss": -0.2953, "num_tokens": 415372.0, "reward": 0.02452818863093853, "reward_std": 0.018821807578206062, "rewards/bleu_reward_func/mean": 0.02452818863093853, "rewards/bleu_reward_func/std": 0.03300207853317261, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 213.84375, "completions/mean_terminated_length": 114.45833587646484, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0232, "grad_norm": 7.012094020843506, "kl": 0.0006189346313476562, "learning_rate": 4.6031746031746025e-07, "loss": 0.119, "num_tokens": 428351.0, "reward": 0.055403269827365875, "reward_std": 0.06412488222122192, "rewards/bleu_reward_func/mean": 0.055403269827365875, "rewards/bleu_reward_func/std": 0.07173087447881699, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 374.28125, "completions/mean_terminated_length": 291.6499938964844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.024, "grad_norm": 5.251861095428467, "kl": 0.0007734298706054688, "learning_rate": 4.761904761904761e-07, "loss": 0.0396, "num_tokens": 443856.0, "reward": 0.033150382339954376, "reward_std": 0.029685020446777344, "rewards/bleu_reward_func/mean": 0.033150382339954376, "rewards/bleu_reward_func/std": 0.04449395835399628, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 352.6875, "completions/mean_terminated_length": 212.11764526367188, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0248, "grad_norm": 2.8306992053985596, "kl": 0.0007534027099609375, "learning_rate": 4.92063492063492e-07, "loss": 0.0386, "num_tokens": 458846.0, "reward": 0.07098191231489182, "reward_std": 0.07976502180099487, "rewards/bleu_reward_func/mean": 0.07098191231489182, "rewards/bleu_reward_func/std": 0.13301755487918854, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 314.96875, "completions/mean_terminated_length": 249.2916717529297, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0256, "grad_norm": 4.18798303604126, "kl": 0.0013475418090820312, "learning_rate": 5.079365079365079e-07, "loss": 0.1184, "num_tokens": 475693.0, "reward": 0.06003670394420624, "reward_std": 0.04762943834066391, "rewards/bleu_reward_func/mean": 0.06003670394420624, "rewards/bleu_reward_func/std": 0.06799852848052979, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 246.09375, "completions/mean_terminated_length": 157.45834350585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0264, "grad_norm": 7.522784233093262, "kl": 0.0013055801391601562, "learning_rate": 5.238095238095238e-07, "loss": 0.2574, "num_tokens": 489632.0, "reward": 0.035463202744722366, "reward_std": 0.02683849260210991, "rewards/bleu_reward_func/mean": 0.035463202744722366, "rewards/bleu_reward_func/std": 0.05300255864858627, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 281.40625, "completions/mean_terminated_length": 191.17391967773438, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.0272, "grad_norm": 4.312166213989258, "kl": 0.0016279220581054688, "learning_rate": 5.396825396825396e-07, "loss": 0.0276, "num_tokens": 503221.0, "reward": 0.036928486078977585, "reward_std": 0.030746515840291977, "rewards/bleu_reward_func/mean": 0.036928486078977585, "rewards/bleu_reward_func/std": 0.041675370186567307, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 363.0, "completions/mean_terminated_length": 231.5294189453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.028, "grad_norm": 5.934871196746826, "kl": 0.001247406005859375, "learning_rate": 5.555555555555555e-07, "loss": 0.0007, "num_tokens": 519629.0, "reward": 0.02279968000948429, "reward_std": 0.0171576626598835, "rewards/bleu_reward_func/mean": 0.02279968000948429, "rewards/bleu_reward_func/std": 0.02809896320104599, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 227.33334350585938, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0288, "grad_norm": 6.152184963226318, "kl": 0.00135040283203125, "learning_rate": 5.714285714285714e-07, "loss": 0.1277, "num_tokens": 531009.0, "reward": 0.08614860475063324, "reward_std": 0.05592390149831772, "rewards/bleu_reward_func/mean": 0.08614860475063324, "rewards/bleu_reward_func/std": 0.07292494177818298, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 315.46875, "completions/mean_terminated_length": 226.13636779785156, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0296, "grad_norm": 4.999361991882324, "kl": 0.0010061264038085938, "learning_rate": 5.873015873015873e-07, "loss": -0.1945, "num_tokens": 553904.0, "reward": 0.022978566586971283, "reward_std": 0.0320000983774662, "rewards/bleu_reward_func/mean": 0.022978566586971283, "rewards/bleu_reward_func/std": 0.05384916067123413, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 195.84616088867188, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0304, "grad_norm": 12.462119102478027, "kl": 0.00140380859375, "learning_rate": 6.031746031746031e-07, "loss": -0.0499, "num_tokens": 569980.0, "reward": 0.06601191312074661, "reward_std": 0.06571432948112488, "rewards/bleu_reward_func/mean": 0.06601191312074661, "rewards/bleu_reward_func/std": 0.11037519574165344, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 350.78125, "completions/mean_terminated_length": 287.6956481933594, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0312, "grad_norm": 4.318572044372559, "kl": 0.0013608932495117188, "learning_rate": 6.19047619047619e-07, "loss": 0.1581, "num_tokens": 584165.0, "reward": 0.03686396777629852, "reward_std": 0.00873212143778801, "rewards/bleu_reward_func/mean": 0.03686396777629852, "rewards/bleu_reward_func/std": 0.03987700119614601, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 395.40625, "completions/mean_terminated_length": 315.631591796875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.032, "grad_norm": 2.8483028411865234, "kl": 0.0015163421630859375, "learning_rate": 6.349206349206349e-07, "loss": 0.2409, "num_tokens": 599602.0, "reward": 0.012605215422809124, "reward_std": 0.007717709057033062, "rewards/bleu_reward_func/mean": 0.012605215422809124, "rewards/bleu_reward_func/std": 0.008546828292310238, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 306.46875, "completions/mean_terminated_length": 125.11764526367188, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0328, "grad_norm": 15.546673774719238, "kl": 0.0033998489379882812, "learning_rate": 6.507936507936507e-07, "loss": 0.2262, "num_tokens": 617761.0, "reward": 0.037311654537916183, "reward_std": 0.04001215100288391, "rewards/bleu_reward_func/mean": 0.037311654537916183, "rewards/bleu_reward_func/std": 0.05116492509841919, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 77.60000610351562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0336, "grad_norm": 9.35763168334961, "kl": 0.0030651092529296875, "learning_rate": 6.666666666666666e-07, "loss": 0.1783, "num_tokens": 637541.0, "reward": 0.06710080057382584, "reward_std": 0.0418785884976387, "rewards/bleu_reward_func/mean": 0.06710080057382584, "rewards/bleu_reward_func/std": 0.09365852922201157, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 336.59375, "completions/mean_terminated_length": 216.57894897460938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0344, "grad_norm": 4.267389297485352, "kl": 0.002620697021484375, "learning_rate": 6.825396825396826e-07, "loss": -0.0163, "num_tokens": 650776.0, "reward": 0.04351692646741867, "reward_std": 0.03509015589952469, "rewards/bleu_reward_func/mean": 0.04351692646741867, "rewards/bleu_reward_func/std": 0.052853576838970184, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 177.84616088867188, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0352, "grad_norm": 112.21627807617188, "kl": 0.0047245025634765625, "learning_rate": 6.984126984126983e-07, "loss": 0.0521, "num_tokens": 665576.0, "reward": 0.05281548202037811, "reward_std": 0.034495480358600616, "rewards/bleu_reward_func/mean": 0.05281548202037811, "rewards/bleu_reward_func/std": 0.0704483836889267, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 335.46875, "completions/mean_terminated_length": 310.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.036, "grad_norm": 30.739818572998047, "kl": 0.00278472900390625, "learning_rate": 7.142857142857143e-07, "loss": -0.1091, "num_tokens": 678375.0, "reward": 0.04049266129732132, "reward_std": 0.020605597645044327, "rewards/bleu_reward_func/mean": 0.04049266129732132, "rewards/bleu_reward_func/std": 0.04322003573179245, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 253.28125, "completions/mean_terminated_length": 205.37037658691406, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0368, "grad_norm": 3.857532501220703, "kl": 0.002582550048828125, "learning_rate": 7.301587301587301e-07, "loss": 0.1863, "num_tokens": 693632.0, "reward": 0.03602021187543869, "reward_std": 0.03167928382754326, "rewards/bleu_reward_func/mean": 0.03602021187543869, "rewards/bleu_reward_func/std": 0.060269005596637726, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 208.2857208251953, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.0376, "grad_norm": 44.705684661865234, "kl": 0.002960205078125, "learning_rate": 7.46031746031746e-07, "loss": -0.2537, "num_tokens": 712244.0, "reward": 0.009683560580015182, "reward_std": 0.007736856117844582, "rewards/bleu_reward_func/mean": 0.009683560580015182, "rewards/bleu_reward_func/std": 0.010262547992169857, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 209.46875, "completions/mean_terminated_length": 153.44444274902344, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0384, "grad_norm": 4.927426815032959, "kl": 0.010406494140625, "learning_rate": 7.619047619047618e-07, "loss": 0.2249, "num_tokens": 722779.0, "reward": 0.06434739381074905, "reward_std": 0.062096044421195984, "rewards/bleu_reward_func/mean": 0.06434739381074905, "rewards/bleu_reward_func/std": 0.07261113822460175, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 319.53125, "completions/mean_terminated_length": 187.84210205078125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0392, "grad_norm": 6.036177158355713, "kl": 0.0051422119140625, "learning_rate": 7.777777777777778e-07, "loss": 0.2132, "num_tokens": 735892.0, "reward": 0.0316137932240963, "reward_std": 0.028243713080883026, "rewards/bleu_reward_func/mean": 0.0316137932240963, "rewards/bleu_reward_func/std": 0.032289810478687286, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 329.0625, "completions/mean_terminated_length": 233.23809814453125, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.04, "grad_norm": 6.000904560089111, "kl": 0.0041103363037109375, "learning_rate": 7.936507936507936e-07, "loss": 0.164, "num_tokens": 748926.0, "reward": 0.031059542670845985, "reward_std": 0.02046222612261772, "rewards/bleu_reward_func/mean": 0.031059542670845985, "rewards/bleu_reward_func/std": 0.029215287417173386, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 240.03125, "completions/mean_terminated_length": 201.17857360839844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0408, "grad_norm": 4.45400333404541, "kl": 0.00446319580078125, "learning_rate": 8.095238095238095e-07, "loss": 0.095, "num_tokens": 763935.0, "reward": 0.06022896245121956, "reward_std": 0.04401791840791702, "rewards/bleu_reward_func/mean": 0.06022896245121956, "rewards/bleu_reward_func/std": 0.06288844347000122, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 169.89474487304688, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.0416, "grad_norm": 6.896392822265625, "kl": 0.00627899169921875, "learning_rate": 8.253968253968253e-07, "loss": 0.1302, "num_tokens": 781619.0, "reward": 0.02847466617822647, "reward_std": 0.024918708950281143, "rewards/bleu_reward_func/mean": 0.02847466617822647, "rewards/bleu_reward_func/std": 0.03209677338600159, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 309.9375, "completions/mean_terminated_length": 263.3077087402344, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0424, "grad_norm": 5.093250751495361, "kl": 0.004283905029296875, "learning_rate": 8.412698412698413e-07, "loss": -0.0777, "num_tokens": 795977.0, "reward": 0.07096201926469803, "reward_std": 0.06636855751276016, "rewards/bleu_reward_func/mean": 0.07096201926469803, "rewards/bleu_reward_func/std": 0.09039857983589172, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 386.4375, "completions/mean_terminated_length": 202.92308044433594, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0432, "grad_norm": 3.2264180183410645, "kl": 0.0028362274169921875, "learning_rate": 8.57142857142857e-07, "loss": -0.0863, "num_tokens": 814135.0, "reward": 0.014086933806538582, "reward_std": 0.013363949954509735, "rewards/bleu_reward_func/mean": 0.014086933806538582, "rewards/bleu_reward_func/std": 0.01598522998392582, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 244.59375, "completions/mean_terminated_length": 195.07408142089844, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.044, "grad_norm": 8.706979751586914, "kl": 0.00571441650390625, "learning_rate": 8.73015873015873e-07, "loss": 0.1609, "num_tokens": 827226.0, "reward": 0.0647934228181839, "reward_std": 0.0345802828669548, "rewards/bleu_reward_func/mean": 0.0647934228181839, "rewards/bleu_reward_func/std": 0.04030924290418625, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 126.0625, "completions/mean_terminated_length": 113.61289978027344, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0448, "grad_norm": 8.598736763000488, "kl": 0.0142974853515625, "learning_rate": 8.888888888888888e-07, "loss": 0.1419, "num_tokens": 834268.0, "reward": 0.04880748316645622, "reward_std": 0.042880259454250336, "rewards/bleu_reward_func/mean": 0.04880748316645622, "rewards/bleu_reward_func/std": 0.05060458555817604, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 302.03125, "completions/mean_terminated_length": 219.86956787109375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0456, "grad_norm": 6.926377296447754, "kl": 0.009532928466796875, "learning_rate": 9.047619047619047e-07, "loss": -0.0374, "num_tokens": 851701.0, "reward": 0.06913506239652634, "reward_std": 0.04138587415218353, "rewards/bleu_reward_func/mean": 0.06913506239652634, "rewards/bleu_reward_func/std": 0.0750163346529007, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 211.40625, "completions/mean_terminated_length": 127.23999786376953, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0464, "grad_norm": 7.853041648864746, "kl": 0.0272064208984375, "learning_rate": 9.206349206349205e-07, "loss": 0.4023, "num_tokens": 865434.0, "reward": 0.12499310076236725, "reward_std": 0.08980046212673187, "rewards/bleu_reward_func/mean": 0.12499310076236725, "rewards/bleu_reward_func/std": 0.13493874669075012, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 390.65625, "completions/mean_terminated_length": 283.5882263183594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0472, "grad_norm": 2.4230539798736572, "kl": 0.003765106201171875, "learning_rate": 9.365079365079365e-07, "loss": -0.0092, "num_tokens": 884815.0, "reward": 0.021261584013700485, "reward_std": 0.027461236342787743, "rewards/bleu_reward_func/mean": 0.021261584013700485, "rewards/bleu_reward_func/std": 0.03110821731388569, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 281.59375, "completions/mean_terminated_length": 191.43478393554688, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.048, "grad_norm": 2.9367215633392334, "kl": 0.00628662109375, "learning_rate": 9.523809523809522e-07, "loss": 0.2021, "num_tokens": 896810.0, "reward": 0.023613639175891876, "reward_std": 0.02252291887998581, "rewards/bleu_reward_func/mean": 0.023613639175891876, "rewards/bleu_reward_func/std": 0.041281431913375854, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 272.34375, "completions/mean_terminated_length": 227.9629669189453, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0488, "grad_norm": 3.998281717300415, "kl": 0.0076904296875, "learning_rate": 9.682539682539682e-07, "loss": -0.1513, "num_tokens": 907349.0, "reward": 0.07193129509687424, "reward_std": 0.05195175111293793, "rewards/bleu_reward_func/mean": 0.07193129509687424, "rewards/bleu_reward_func/std": 0.07358168065547943, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 82.25, "completions/mean_terminated_length": 82.25, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0496, "grad_norm": 12.274957656860352, "kl": 0.049835205078125, "learning_rate": 9.84126984126984e-07, "loss": 0.0211, "num_tokens": 916605.0, "reward": 0.1968570053577423, "reward_std": 0.09575757384300232, "rewards/bleu_reward_func/mean": 0.1968570053577423, "rewards/bleu_reward_func/std": 0.14971531927585602, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 209.71875, "completions/mean_terminated_length": 178.44827270507812, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0504, "grad_norm": 4.69417142868042, "kl": 0.0074615478515625, "learning_rate": 1e-06, "loss": -0.0102, "num_tokens": 924772.0, "reward": 0.026346374303102493, "reward_std": 0.015668006613850594, "rewards/bleu_reward_func/mean": 0.026346374303102493, "rewards/bleu_reward_func/std": 0.016677534207701683, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 147.34375, "completions/mean_terminated_length": 95.25000762939453, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.0512, "grad_norm": 7.854241371154785, "kl": 0.019744873046875, "learning_rate": 1e-06, "loss": 0.1402, "num_tokens": 932879.0, "reward": 0.039189111441373825, "reward_std": 0.034408073872327805, "rewards/bleu_reward_func/mean": 0.039189111441373825, "rewards/bleu_reward_func/std": 0.06643246859312057, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 217.53125, "completions/mean_terminated_length": 163.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.052, "grad_norm": 5.94617223739624, "kl": 0.012752532958984375, "learning_rate": 1e-06, "loss": 0.0629, "num_tokens": 944744.0, "reward": 0.0992283821105957, "reward_std": 0.04174066707491875, "rewards/bleu_reward_func/mean": 0.0992283821105957, "rewards/bleu_reward_func/std": 0.14538165926933289, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 360.84375, "completions/mean_terminated_length": 281.66668701171875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.0528, "grad_norm": 2.7164971828460693, "kl": 0.00780487060546875, "learning_rate": 1e-06, "loss": -0.1815, "num_tokens": 959043.0, "reward": 0.03164489567279816, "reward_std": 0.024089161306619644, "rewards/bleu_reward_func/mean": 0.03164489567279816, "rewards/bleu_reward_func/std": 0.03230883181095123, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 79.83999633789062, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.0536, "grad_norm": 8.954367637634277, "kl": 0.040679931640625, "learning_rate": 1e-06, "loss": 0.4022, "num_tokens": 970487.0, "reward": 0.1188623458147049, "reward_std": 0.06528393179178238, "rewards/bleu_reward_func/mean": 0.1188623458147049, "rewards/bleu_reward_func/std": 0.10126637667417526, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 249.78125, "completions/mean_terminated_length": 112.42857360839844, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.0544, "grad_norm": 8.929741859436035, "kl": 0.01055908203125, "learning_rate": 1e-06, "loss": -0.3676, "num_tokens": 980432.0, "reward": 0.04680415242910385, "reward_std": 0.015473801642656326, "rewards/bleu_reward_func/mean": 0.04680415242910385, "rewards/bleu_reward_func/std": 0.05666949972510338, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 200.28125, "completions/mean_terminated_length": 155.75, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.0552, "grad_norm": 19.934701919555664, "kl": 0.036396026611328125, "learning_rate": 1e-06, "loss": 0.1105, "num_tokens": 988713.0, "reward": 0.03349726274609566, "reward_std": 0.007375569082796574, "rewards/bleu_reward_func/mean": 0.03349726274609566, "rewards/bleu_reward_func/std": 0.0360921286046505, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 295.1875, "completions/mean_terminated_length": 146.84210205078125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.056, "grad_norm": 3.6616406440734863, "kl": 0.0112762451171875, "learning_rate": 1e-06, "loss": 0.081, "num_tokens": 1002319.0, "reward": 0.016106903553009033, "reward_std": 0.008415726944804192, "rewards/bleu_reward_func/mean": 0.016106903553009033, "rewards/bleu_reward_func/std": 0.012413726188242435, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 235.6428680419922, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.0568, "grad_norm": 11.310477256774902, "kl": 0.02431488037109375, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 1014117.0, "reward": 0.09336908906698227, "reward_std": 0.04001408815383911, "rewards/bleu_reward_func/mean": 0.09336908906698227, "rewards/bleu_reward_func/std": 0.04507448151707649, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 298.96875, "completions/mean_terminated_length": 187.38095092773438, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0576, "grad_norm": 11.831945419311523, "kl": 0.0236358642578125, "learning_rate": 1e-06, "loss": 0.2398, "num_tokens": 1029324.0, "reward": 0.06671467423439026, "reward_std": 0.07224421948194504, "rewards/bleu_reward_func/mean": 0.06671467423439026, "rewards/bleu_reward_func/std": 0.09839192777872086, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 262.28125, "completions/mean_terminated_length": 245.6333465576172, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.0584, "grad_norm": 3.4266369342803955, "kl": 0.01332855224609375, "learning_rate": 1e-06, "loss": -0.117, "num_tokens": 1040677.0, "reward": 0.048909105360507965, "reward_std": 0.01749919354915619, "rewards/bleu_reward_func/mean": 0.048909105360507965, "rewards/bleu_reward_func/std": 0.046220190823078156, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 315.78125, "completions/mean_terminated_length": 213.0, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.0592, "grad_norm": 3.334998369216919, "kl": 0.030864715576171875, "learning_rate": 1e-06, "loss": 0.0762, "num_tokens": 1057350.0, "reward": 0.06654933840036392, "reward_std": 0.030867960304021835, "rewards/bleu_reward_func/mean": 0.06654933840036392, "rewards/bleu_reward_func/std": 0.04364337399601936, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 287.4375, "completions/mean_terminated_length": 235.61538696289062, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.06, "grad_norm": 12.321810722351074, "kl": 0.05252838134765625, "learning_rate": 1e-06, "loss": 0.1111, "num_tokens": 1072668.0, "reward": 0.07815341651439667, "reward_std": 0.05233295261859894, "rewards/bleu_reward_func/mean": 0.07815341651439667, "rewards/bleu_reward_func/std": 0.0646696388721466, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 306.3125, "completions/mean_terminated_length": 182.90000915527344, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.0608, "grad_norm": 3.883251905441284, "kl": 0.0373382568359375, "learning_rate": 1e-06, "loss": 0.1517, "num_tokens": 1086694.0, "reward": 0.06417744606733322, "reward_std": 0.034075379371643066, "rewards/bleu_reward_func/mean": 0.06417744606733322, "rewards/bleu_reward_func/std": 0.049788232892751694, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 196.4375, "completions/mean_terminated_length": 196.4375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.0616, "grad_norm": 6.460638523101807, "kl": 0.05291748046875, "learning_rate": 1e-06, "loss": -0.0849, "num_tokens": 1097476.0, "reward": 0.08122064173221588, "reward_std": 0.03298315033316612, "rewards/bleu_reward_func/mean": 0.08122064173221588, "rewards/bleu_reward_func/std": 0.047924816608428955, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 199.15625, "completions/mean_terminated_length": 166.79310607910156, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.0624, "grad_norm": 6.63805627822876, "kl": 0.07642364501953125, "learning_rate": 1e-06, "loss": -0.0967, "num_tokens": 1108793.0, "reward": 0.07887591421604156, "reward_std": 0.05435461550951004, "rewards/bleu_reward_func/mean": 0.07887591421604156, "rewards/bleu_reward_func/std": 0.10201766341924667, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 147.42857360839844, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.0632, "grad_norm": 14.087907791137695, "kl": 0.0762939453125, "learning_rate": 1e-06, "loss": -0.056, "num_tokens": 1118721.0, "reward": 0.035933416336774826, "reward_std": 0.02187356725335121, "rewards/bleu_reward_func/mean": 0.035933416336774826, "rewards/bleu_reward_func/std": 0.025764403864741325, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 192.9375, "completions/mean_terminated_length": 133.8518524169922, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.064, "grad_norm": 6.712767124176025, "kl": 0.07209014892578125, "learning_rate": 1e-06, "loss": 0.2799, "num_tokens": 1130095.0, "reward": 0.04000134766101837, "reward_std": 0.014790613204240799, "rewards/bleu_reward_func/mean": 0.04000134766101837, "rewards/bleu_reward_func/std": 0.028310615569353104, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 395.5625, "completions/mean_terminated_length": 315.8947448730469, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.0648, "grad_norm": 3.1348772048950195, "kl": 0.012363433837890625, "learning_rate": 1e-06, "loss": -0.0477, "num_tokens": 1145009.0, "reward": 0.05394501984119415, "reward_std": 0.019456665962934494, "rewards/bleu_reward_func/mean": 0.05394501984119415, "rewards/bleu_reward_func/std": 0.05528007075190544, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 297.25, "completions/mean_terminated_length": 213.21739196777344, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0656, "grad_norm": 4.045035362243652, "kl": 0.02558135986328125, "learning_rate": 1e-06, "loss": -0.0948, "num_tokens": 1156665.0, "reward": 0.08088956773281097, "reward_std": 0.031020794063806534, "rewards/bleu_reward_func/mean": 0.08088956773281097, "rewards/bleu_reward_func/std": 0.04719265177845955, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 252.65625, "completions/mean_terminated_length": 204.629638671875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.0664, "grad_norm": 7.020449161529541, "kl": 0.022491455078125, "learning_rate": 1e-06, "loss": -0.2084, "num_tokens": 1170686.0, "reward": 0.048978567123413086, "reward_std": 0.014538805931806564, "rewards/bleu_reward_func/mean": 0.048978567123413086, "rewards/bleu_reward_func/std": 0.03447263315320015, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 279.53125, "completions/mean_terminated_length": 157.76190185546875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0672, "grad_norm": 6.09721040725708, "kl": 0.02556610107421875, "learning_rate": 1e-06, "loss": 0.0979, "num_tokens": 1181927.0, "reward": 0.07267215847969055, "reward_std": 0.029872559010982513, "rewards/bleu_reward_func/mean": 0.07267215847969055, "rewards/bleu_reward_func/std": 0.05035723000764847, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 281.1875, "completions/mean_terminated_length": 257.3103332519531, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.068, "grad_norm": 5.706907272338867, "kl": 0.030059814453125, "learning_rate": 1e-06, "loss": -0.0967, "num_tokens": 1198797.0, "reward": 0.05050581321120262, "reward_std": 0.023779014125466347, "rewards/bleu_reward_func/mean": 0.05050581321120262, "rewards/bleu_reward_func/std": 0.03608938306570053, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 163.65625, "completions/mean_terminated_length": 163.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.0688, "grad_norm": 7.1789960861206055, "kl": 0.0573577880859375, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 1207106.0, "reward": 0.07873363792896271, "reward_std": 0.0395892933011055, "rewards/bleu_reward_func/mean": 0.07873363792896271, "rewards/bleu_reward_func/std": 0.0705900639295578, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 304.6875, "completions/mean_terminated_length": 223.56521606445312, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.0696, "grad_norm": 3.9068655967712402, "kl": 0.01955413818359375, "learning_rate": 1e-06, "loss": -0.2361, "num_tokens": 1219760.0, "reward": 0.03426438570022583, "reward_std": 0.021733341738581657, "rewards/bleu_reward_func/mean": 0.03426438570022583, "rewards/bleu_reward_func/std": 0.031944356858730316, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 137.1875, "completions/mean_terminated_length": 137.1875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0704, "grad_norm": 7.929437160491943, "kl": 0.1163482666015625, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 1230702.0, "reward": 0.13437795639038086, "reward_std": 0.04989761859178543, "rewards/bleu_reward_func/mean": 0.13437795639038086, "rewards/bleu_reward_func/std": 0.08757011592388153, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 248.1875, "completions/mean_terminated_length": 160.25, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0712, "grad_norm": 3.5676372051239014, "kl": 0.029571533203125, "learning_rate": 1e-06, "loss": 0.085, "num_tokens": 1241748.0, "reward": 0.06261839717626572, "reward_std": 0.05303023010492325, "rewards/bleu_reward_func/mean": 0.06261839717626572, "rewards/bleu_reward_func/std": 0.07371754199266434, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 199.21875, "completions/mean_terminated_length": 178.36666870117188, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.072, "grad_norm": 13.081062316894531, "kl": 0.0998382568359375, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 1254971.0, "reward": 0.09151400625705719, "reward_std": 0.049102533608675, "rewards/bleu_reward_func/mean": 0.09151400625705719, "rewards/bleu_reward_func/std": 0.08098553121089935, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 277.8125, "completions/mean_terminated_length": 244.35714721679688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0728, "grad_norm": 4.541591167449951, "kl": 0.0150604248046875, "learning_rate": 1e-06, "loss": -0.2243, "num_tokens": 1268229.0, "reward": 0.029024727642536163, "reward_std": 0.02233259379863739, "rewards/bleu_reward_func/mean": 0.029024727642536163, "rewards/bleu_reward_func/std": 0.0296621173620224, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 81.75999450683594, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0736, "grad_norm": 12.45702075958252, "kl": 0.09152984619140625, "learning_rate": 1e-06, "loss": 0.3301, "num_tokens": 1279753.0, "reward": 0.06008782982826233, "reward_std": 0.03770461678504944, "rewards/bleu_reward_func/mean": 0.06008782982826233, "rewards/bleu_reward_func/std": 0.056894708424806595, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 71.03125, "completions/mean_terminated_length": 71.03125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0744, "grad_norm": 8.271183967590332, "kl": 0.0682220458984375, "learning_rate": 1e-06, "loss": 0.2167, "num_tokens": 1288762.0, "reward": 0.17779187858104706, "reward_std": 0.02900426834821701, "rewards/bleu_reward_func/mean": 0.17779187858104706, "rewards/bleu_reward_func/std": 0.1678331196308136, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 132.28125, "completions/mean_terminated_length": 78.03572082519531, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.0752, "grad_norm": 45.396934509277344, "kl": 0.140625, "learning_rate": 1e-06, "loss": 0.1526, "num_tokens": 1299835.0, "reward": 0.1527654230594635, "reward_std": 0.061802513897418976, "rewards/bleu_reward_func/mean": 0.1527654230594635, "rewards/bleu_reward_func/std": 0.10723396390676498, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 350.8125, "completions/mean_terminated_length": 266.3809509277344, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.076, "grad_norm": 4.4763383865356445, "kl": 0.03443145751953125, "learning_rate": 1e-06, "loss": 0.0315, "num_tokens": 1314877.0, "reward": 0.08366496115922928, "reward_std": 0.023002739995718002, "rewards/bleu_reward_func/mean": 0.08366496115922928, "rewards/bleu_reward_func/std": 0.07334847003221512, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 209.37930297851562, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.0768, "grad_norm": 13.181612968444824, "kl": 0.0660247802734375, "learning_rate": 1e-06, "loss": -0.1104, "num_tokens": 1326757.0, "reward": 0.04618287831544876, "reward_std": 0.022957133129239082, "rewards/bleu_reward_func/mean": 0.04618287831544876, "rewards/bleu_reward_func/std": 0.03049774467945099, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 326.71875, "completions/mean_terminated_length": 254.21739196777344, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.0776, "grad_norm": 5.014129161834717, "kl": 0.022705078125, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 1340716.0, "reward": 0.08603382110595703, "reward_std": 0.022703565657138824, "rewards/bleu_reward_func/mean": 0.08603382110595703, "rewards/bleu_reward_func/std": 0.09760169684886932, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 205.03125, "completions/mean_terminated_length": 195.1290283203125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.0784, "grad_norm": 10.452898025512695, "kl": 0.101318359375, "learning_rate": 1e-06, "loss": -0.0956, "num_tokens": 1354373.0, "reward": 0.07816646993160248, "reward_std": 0.03450850397348404, "rewards/bleu_reward_func/mean": 0.07816646993160248, "rewards/bleu_reward_func/std": 0.05475042015314102, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 240.4375, "completions/mean_terminated_length": 190.1481475830078, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0792, "grad_norm": 8.82993221282959, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": -0.1898, "num_tokens": 1365875.0, "reward": 0.027829378843307495, "reward_std": 0.016982190310955048, "rewards/bleu_reward_func/mean": 0.027829378843307495, "rewards/bleu_reward_func/std": 0.019511230289936066, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 210.21875, "completions/mean_terminated_length": 140.57693481445312, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.08, "grad_norm": 14.261658668518066, "kl": 0.1817169189453125, "learning_rate": 1e-06, "loss": -0.4193, "num_tokens": 1375586.0, "reward": 0.0430663600564003, "reward_std": 0.023313239216804504, "rewards/bleu_reward_func/mean": 0.0430663600564003, "rewards/bleu_reward_func/std": 0.0409073531627655, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 210.90625, "completions/mean_terminated_length": 201.19354248046875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.0808, "grad_norm": 7.960334300994873, "kl": 0.0833740234375, "learning_rate": 1e-06, "loss": 0.1121, "num_tokens": 1384975.0, "reward": 0.0974574014544487, "reward_std": 0.03397291898727417, "rewards/bleu_reward_func/mean": 0.0974574014544487, "rewards/bleu_reward_func/std": 0.10795393586158752, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 306.21875, "completions/mean_terminated_length": 225.69566345214844, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.0816, "grad_norm": 3.8322501182556152, "kl": 0.0701446533203125, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 1403126.0, "reward": 0.07732782512903214, "reward_std": 0.038768649101257324, "rewards/bleu_reward_func/mean": 0.07732782512903214, "rewards/bleu_reward_func/std": 0.06468553096055984, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 167.8125, "completions/mean_terminated_length": 144.86666870117188, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.0824, "grad_norm": 6.311352729797363, "kl": 0.06109619140625, "learning_rate": 1e-06, "loss": 0.1029, "num_tokens": 1417360.0, "reward": 0.23947298526763916, "reward_std": 0.10021178424358368, "rewards/bleu_reward_func/mean": 0.23947298526763916, "rewards/bleu_reward_func/std": 0.40957576036453247, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 319.375, "completions/mean_terminated_length": 126.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0832, "grad_norm": 6.871039867401123, "kl": 0.06162261962890625, "learning_rate": 1e-06, "loss": 0.3071, "num_tokens": 1431932.0, "reward": 0.11237628757953644, "reward_std": 0.05608592554926872, "rewards/bleu_reward_func/mean": 0.11237628757953644, "rewards/bleu_reward_func/std": 0.1758151650428772, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 206.34375, "completions/mean_terminated_length": 104.45833587646484, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.084, "grad_norm": 13.681912422180176, "kl": 0.063812255859375, "learning_rate": 1e-06, "loss": 0.3711, "num_tokens": 1440791.0, "reward": 0.13408097624778748, "reward_std": 0.07736363261938095, "rewards/bleu_reward_func/mean": 0.13408097624778748, "rewards/bleu_reward_func/std": 0.10995227843523026, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 239.21875, "completions/mean_terminated_length": 188.70370483398438, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.0848, "grad_norm": 6.377567291259766, "kl": 0.212432861328125, "learning_rate": 1e-06, "loss": 0.1742, "num_tokens": 1452430.0, "reward": 0.09214982390403748, "reward_std": 0.037541188299655914, "rewards/bleu_reward_func/mean": 0.09214982390403748, "rewards/bleu_reward_func/std": 0.06507368385791779, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 382.84375, "completions/mean_terminated_length": 236.4666748046875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.0856, "grad_norm": 2.956113338470459, "kl": 0.015625, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 1470353.0, "reward": 0.029356852173805237, "reward_std": 0.020268836989998817, "rewards/bleu_reward_func/mean": 0.029356852173805237, "rewards/bleu_reward_func/std": 0.031047984957695007, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 247.1999969482422, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.0864, "grad_norm": 5.237264156341553, "kl": 0.0384063720703125, "learning_rate": 1e-06, "loss": -0.2289, "num_tokens": 1483353.0, "reward": 0.06388352811336517, "reward_std": 0.03146419674158096, "rewards/bleu_reward_func/mean": 0.06388352811336517, "rewards/bleu_reward_func/std": 0.0666789561510086, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 326.1875, "completions/mean_terminated_length": 199.05262756347656, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0872, "grad_norm": 5.559842109680176, "kl": 0.03765106201171875, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 1497415.0, "reward": 0.2991971969604492, "reward_std": 0.10907518863677979, "rewards/bleu_reward_func/mean": 0.2991971969604492, "rewards/bleu_reward_func/std": 0.36222296953201294, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 98.53125, "completions/mean_terminated_length": 98.53125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.088, "grad_norm": 8.213761329650879, "kl": 0.078704833984375, "learning_rate": 1e-06, "loss": 0.1858, "num_tokens": 1504528.0, "reward": 0.048464857041835785, "reward_std": 0.0210396908223629, "rewards/bleu_reward_func/mean": 0.048464857041835785, "rewards/bleu_reward_func/std": 0.03311728686094284, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 118.59375, "completions/mean_terminated_length": 118.59375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.0888, "grad_norm": 7.166593074798584, "kl": 0.096771240234375, "learning_rate": 1e-06, "loss": -0.0566, "num_tokens": 1517035.0, "reward": 0.09873979538679123, "reward_std": 0.03707325458526611, "rewards/bleu_reward_func/mean": 0.09873979538679123, "rewards/bleu_reward_func/std": 0.13200855255126953, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 401.5, "completions/mean_terminated_length": 240.00001525878906, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.0896, "grad_norm": 2.8505043983459473, "kl": 0.02176666259765625, "learning_rate": 1e-06, "loss": -0.0153, "num_tokens": 1534363.0, "reward": 0.11044108867645264, "reward_std": 0.03410620242357254, "rewards/bleu_reward_func/mean": 0.11044108867645264, "rewards/bleu_reward_func/std": 0.16289857029914856, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 125.96875, "completions/mean_terminated_length": 125.96875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0904, "grad_norm": 7.475080490112305, "kl": 0.100982666015625, "learning_rate": 1e-06, "loss": -0.0271, "num_tokens": 1546258.0, "reward": 0.12119434028863907, "reward_std": 0.03986787050962448, "rewards/bleu_reward_func/mean": 0.12119434028863907, "rewards/bleu_reward_func/std": 0.10625314712524414, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 332.09375, "completions/mean_terminated_length": 250.3181915283203, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.0912, "grad_norm": 3.1941514015197754, "kl": 0.01666259765625, "learning_rate": 1e-06, "loss": -0.0363, "num_tokens": 1559133.0, "reward": 0.05715271458029747, "reward_std": 0.04336331784725189, "rewards/bleu_reward_func/mean": 0.05715271458029747, "rewards/bleu_reward_func/std": 0.05400845408439636, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 182.03125, "completions/mean_terminated_length": 160.03334045410156, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.092, "grad_norm": 8.439948081970215, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": -0.2392, "num_tokens": 1569774.0, "reward": 0.0502852126955986, "reward_std": 0.01610748842358589, "rewards/bleu_reward_func/mean": 0.0502852126955986, "rewards/bleu_reward_func/std": 0.040807489305734634, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 163.84375, "completions/mean_terminated_length": 163.84375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.0928, "grad_norm": 4.541551113128662, "kl": 0.069915771484375, "learning_rate": 1e-06, "loss": -0.0109, "num_tokens": 1580281.0, "reward": 0.03980318829417229, "reward_std": 0.01563824526965618, "rewards/bleu_reward_func/mean": 0.03980318829417229, "rewards/bleu_reward_func/std": 0.023048467934131622, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 226.39999389648438, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.0936, "grad_norm": 8.210314750671387, "kl": 0.09372711181640625, "learning_rate": 1e-06, "loss": 0.122, "num_tokens": 1593285.0, "reward": 0.0629456490278244, "reward_std": 0.015063179656863213, "rewards/bleu_reward_func/mean": 0.0629456490278244, "rewards/bleu_reward_func/std": 0.03602227941155434, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 350.0, "completions/mean_terminated_length": 265.1428527832031, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.0944, "grad_norm": 2.992391586303711, "kl": 0.02304840087890625, "learning_rate": 1e-06, "loss": -0.1711, "num_tokens": 1606645.0, "reward": 0.022465957328677177, "reward_std": 0.016872048377990723, "rewards/bleu_reward_func/mean": 0.022465957328677177, "rewards/bleu_reward_func/std": 0.023790787905454636, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 233.07693481445312, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.0952, "grad_norm": 4.377739429473877, "kl": 0.0274658203125, "learning_rate": 1e-06, "loss": -0.0351, "num_tokens": 1617737.0, "reward": 0.053562991321086884, "reward_std": 0.025934984907507896, "rewards/bleu_reward_func/mean": 0.053562991321086884, "rewards/bleu_reward_func/std": 0.03459456190466881, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 85.17646789550781, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.096, "grad_norm": 5.66406774520874, "kl": 0.032745361328125, "learning_rate": 1e-06, "loss": 0.1042, "num_tokens": 1631409.0, "reward": 0.05639251321554184, "reward_std": 0.025049947202205658, "rewards/bleu_reward_func/mean": 0.05639251321554184, "rewards/bleu_reward_func/std": 0.04031047970056534, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 391.25, "completions/mean_terminated_length": 254.40000915527344, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.0968, "grad_norm": 2.7011678218841553, "kl": 0.019012451171875, "learning_rate": 1e-06, "loss": -0.0595, "num_tokens": 1647665.0, "reward": 0.1355845332145691, "reward_std": 0.03834523260593414, "rewards/bleu_reward_func/mean": 0.1355845332145691, "rewards/bleu_reward_func/std": 0.17731845378875732, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 360.09375, "completions/mean_terminated_length": 241.94444274902344, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.0976, "grad_norm": 3.2975172996520996, "kl": 0.0248870849609375, "learning_rate": 1e-06, "loss": -0.0281, "num_tokens": 1661164.0, "reward": 0.02182621881365776, "reward_std": 0.010437489487230778, "rewards/bleu_reward_func/mean": 0.02182621881365776, "rewards/bleu_reward_func/std": 0.019065655767917633, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 329.03125, "completions/mean_terminated_length": 295.1481628417969, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.0984, "grad_norm": 3.011232376098633, "kl": 0.023040771484375, "learning_rate": 1e-06, "loss": -0.0609, "num_tokens": 1674269.0, "reward": 0.05195554345846176, "reward_std": 0.020864665508270264, "rewards/bleu_reward_func/mean": 0.05195554345846176, "rewards/bleu_reward_func/std": 0.027087198570370674, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 301.0, "completions/mean_terminated_length": 114.82353210449219, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.0992, "grad_norm": 8.828636169433594, "kl": 0.0798492431640625, "learning_rate": 1e-06, "loss": -0.0267, "num_tokens": 1691797.0, "reward": 0.1420682966709137, "reward_std": 0.04143287241458893, "rewards/bleu_reward_func/mean": 0.1420682966709137, "rewards/bleu_reward_func/std": 0.07349839806556702, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 167.28125, "completions/mean_terminated_length": 167.28125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1, "grad_norm": 7.268754482269287, "kl": 0.104766845703125, "learning_rate": 1e-06, "loss": 0.2657, "num_tokens": 1702150.0, "reward": 0.16663971543312073, "reward_std": 0.05392443761229515, "rewards/bleu_reward_func/mean": 0.16663971543312073, "rewards/bleu_reward_func/std": 0.09980462491512299, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 367.8125, "completions/mean_terminated_length": 223.625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.1008, "grad_norm": 2.9197561740875244, "kl": 0.0296630859375, "learning_rate": 1e-06, "loss": 0.069, "num_tokens": 1720080.0, "reward": 0.05814104527235031, "reward_std": 0.023808015510439873, "rewards/bleu_reward_func/mean": 0.05814104527235031, "rewards/bleu_reward_func/std": 0.06258071959018707, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 311.03125, "completions/mean_terminated_length": 110.0625, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.1016, "grad_norm": 3.6131699085235596, "kl": 0.040496826171875, "learning_rate": 1e-06, "loss": -0.1103, "num_tokens": 1736129.0, "reward": 0.1627029925584793, "reward_std": 0.048266101628541946, "rewards/bleu_reward_func/mean": 0.1627029925584793, "rewards/bleu_reward_func/std": 0.2640880048274994, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 182.1875, "completions/mean_terminated_length": 89.83999633789062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1024, "grad_norm": 5.8553900718688965, "kl": 0.0695953369140625, "learning_rate": 1e-06, "loss": 0.2073, "num_tokens": 1744807.0, "reward": 0.05680542066693306, "reward_std": 0.02900797501206398, "rewards/bleu_reward_func/mean": 0.05680542066693306, "rewards/bleu_reward_func/std": 0.062428779900074005, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 235.1666717529297, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.1032, "grad_norm": 2.625256299972534, "kl": 0.014190673828125, "learning_rate": 1e-06, "loss": -0.3256, "num_tokens": 1759979.0, "reward": 0.07073010504245758, "reward_std": 0.0585593655705452, "rewards/bleu_reward_func/mean": 0.07073010504245758, "rewards/bleu_reward_func/std": 0.0830271914601326, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 269.21875, "completions/mean_terminated_length": 188.2916717529297, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.104, "grad_norm": 7.594178199768066, "kl": 0.019775390625, "learning_rate": 1e-06, "loss": 0.103, "num_tokens": 1772810.0, "reward": 0.03343900665640831, "reward_std": 0.008691318333148956, "rewards/bleu_reward_func/mean": 0.03343900665640831, "rewards/bleu_reward_func/std": 0.027092551812529564, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 307.375, "completions/mean_terminated_length": 286.2069091796875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.1048, "grad_norm": 3.5237069129943848, "kl": 0.0251312255859375, "learning_rate": 1e-06, "loss": 0.0902, "num_tokens": 1786094.0, "reward": 0.03853389620780945, "reward_std": 0.016378795728087425, "rewards/bleu_reward_func/mean": 0.03853389620780945, "rewards/bleu_reward_func/std": 0.02983209490776062, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 244.03125, "completions/mean_terminated_length": 194.40740966796875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.1056, "grad_norm": 3.428116798400879, "kl": 0.072906494140625, "learning_rate": 1e-06, "loss": 0.1144, "num_tokens": 1797551.0, "reward": 0.1538739800453186, "reward_std": 0.03595956414937973, "rewards/bleu_reward_func/mean": 0.1538739800453186, "rewards/bleu_reward_func/std": 0.21548843383789062, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 220.46875, "completions/mean_terminated_length": 211.06451416015625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.1064, "grad_norm": 4.208179950714111, "kl": 0.058685302734375, "learning_rate": 1e-06, "loss": -0.1489, "num_tokens": 1808726.0, "reward": 0.18491268157958984, "reward_std": 0.0416969433426857, "rewards/bleu_reward_func/mean": 0.18491268157958984, "rewards/bleu_reward_func/std": 0.2198871225118637, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 117.5625, "completions/mean_terminated_length": 117.5625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1072, "grad_norm": 9.312064170837402, "kl": 0.2574310302734375, "learning_rate": 1e-06, "loss": -0.1699, "num_tokens": 1816152.0, "reward": 0.09744147956371307, "reward_std": 0.03963543474674225, "rewards/bleu_reward_func/mean": 0.09744147956371307, "rewards/bleu_reward_func/std": 0.07821591198444366, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 284.5625, "completions/mean_terminated_length": 165.42857360839844, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.108, "grad_norm": 2.491009473800659, "kl": 0.5361785888671875, "learning_rate": 1e-06, "loss": 0.0755, "num_tokens": 1830010.0, "reward": 0.07847163081169128, "reward_std": 0.07447989284992218, "rewards/bleu_reward_func/mean": 0.07847163081169128, "rewards/bleu_reward_func/std": 0.1269197165966034, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 208.28125, "completions/mean_terminated_length": 164.8928680419922, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1088, "grad_norm": 9.84536075592041, "kl": 0.137481689453125, "learning_rate": 1e-06, "loss": 0.0643, "num_tokens": 1843019.0, "reward": 0.23385955393314362, "reward_std": 0.07621090114116669, "rewards/bleu_reward_func/mean": 0.23385955393314362, "rewards/bleu_reward_func/std": 0.2127569168806076, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 243.46875, "completions/mean_terminated_length": 138.3913116455078, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.1096, "grad_norm": 6.7033257484436035, "kl": 0.051727294921875, "learning_rate": 1e-06, "loss": 0.1285, "num_tokens": 1856450.0, "reward": 0.04753299057483673, "reward_std": 0.016634728759527206, "rewards/bleu_reward_func/mean": 0.04753299057483673, "rewards/bleu_reward_func/std": 0.030512619763612747, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 352.40625, "completions/mean_terminated_length": 171.53334045410156, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1104, "grad_norm": 3.812756061553955, "kl": 0.0177764892578125, "learning_rate": 1e-06, "loss": 0.1601, "num_tokens": 1870943.0, "reward": 0.04067971557378769, "reward_std": 0.026344479992985725, "rewards/bleu_reward_func/mean": 0.04067971557378769, "rewards/bleu_reward_func/std": 0.06328170746564865, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 287.46875, "completions/mean_terminated_length": 264.2413635253906, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1112, "grad_norm": 4.916464805603027, "kl": 0.06890869140625, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 1883774.0, "reward": 0.1910865753889084, "reward_std": 0.09566200524568558, "rewards/bleu_reward_func/mean": 0.1910865753889084, "rewards/bleu_reward_func/std": 0.2485995888710022, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 312.6875, "completions/mean_terminated_length": 157.6666717529297, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.112, "grad_norm": 5.02897310256958, "kl": 0.081298828125, "learning_rate": 1e-06, "loss": 0.0347, "num_tokens": 1900964.0, "reward": 0.1273106336593628, "reward_std": 0.037408363074064255, "rewards/bleu_reward_func/mean": 0.1273106336593628, "rewards/bleu_reward_func/std": 0.1255699247121811, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 287.40625, "completions/mean_terminated_length": 199.52174377441406, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1128, "grad_norm": 3.5728607177734375, "kl": 0.0290985107421875, "learning_rate": 1e-06, "loss": -0.0802, "num_tokens": 1914153.0, "reward": 0.1449739634990692, "reward_std": 0.05561315268278122, "rewards/bleu_reward_func/mean": 0.1449739634990692, "rewards/bleu_reward_func/std": 0.10589203238487244, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 171.6875, "completions/mean_terminated_length": 149.00001525878906, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1136, "grad_norm": 6.168550491333008, "kl": 0.076751708984375, "learning_rate": 1e-06, "loss": 0.1377, "num_tokens": 1924551.0, "reward": 0.07935678958892822, "reward_std": 0.044586654752492905, "rewards/bleu_reward_func/mean": 0.07935678958892822, "rewards/bleu_reward_func/std": 0.11080160737037659, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 188.28125, "completions/mean_terminated_length": 166.70001220703125, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1144, "grad_norm": 5.237224102020264, "kl": 0.042633056640625, "learning_rate": 1e-06, "loss": 0.1969, "num_tokens": 1936192.0, "reward": 0.07339954376220703, "reward_std": 0.04980514198541641, "rewards/bleu_reward_func/mean": 0.07339954376220703, "rewards/bleu_reward_func/std": 0.06703697144985199, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 168.03125, "completions/mean_terminated_length": 53.375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1152, "grad_norm": 6.578721523284912, "kl": 0.148895263671875, "learning_rate": 1e-06, "loss": 0.2196, "num_tokens": 1946361.0, "reward": 0.2388084977865219, "reward_std": 0.05400132015347481, "rewards/bleu_reward_func/mean": 0.2388084977865219, "rewards/bleu_reward_func/std": 0.2556310296058655, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 254.90625, "completions/mean_terminated_length": 218.17857360839844, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.116, "grad_norm": 5.644160270690918, "kl": 0.10284423828125, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 1958990.0, "reward": 0.05691784247756004, "reward_std": 0.045338764786720276, "rewards/bleu_reward_func/mean": 0.05691784247756004, "rewards/bleu_reward_func/std": 0.051530975848436356, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 126.06451416015625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1168, "grad_norm": 6.659230709075928, "kl": 0.07525634765625, "learning_rate": 1e-06, "loss": 0.3905, "num_tokens": 1966178.0, "reward": 0.08115407824516296, "reward_std": 0.05008203536272049, "rewards/bleu_reward_func/mean": 0.08115407824516296, "rewards/bleu_reward_func/std": 0.060907039791345596, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 69.23077392578125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1176, "grad_norm": 10.029218673706055, "kl": 0.1614227294921875, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 1977194.0, "reward": 0.23646463453769684, "reward_std": 0.09375543892383575, "rewards/bleu_reward_func/mean": 0.23646463453769684, "rewards/bleu_reward_func/std": 0.27427393198013306, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 300.84375, "completions/mean_terminated_length": 174.15000915527344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.1184, "grad_norm": 7.6274027824401855, "kl": 0.0694122314453125, "learning_rate": 1e-06, "loss": -0.0195, "num_tokens": 1991693.0, "reward": 0.1271597295999527, "reward_std": 0.03925805538892746, "rewards/bleu_reward_func/mean": 0.1271597295999527, "rewards/bleu_reward_func/std": 0.20968182384967804, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 235.0625, "completions/mean_terminated_length": 142.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1192, "grad_norm": 5.1624908447265625, "kl": 0.074005126953125, "learning_rate": 1e-06, "loss": -0.0749, "num_tokens": 2002359.0, "reward": 0.0867965817451477, "reward_std": 0.03743039071559906, "rewards/bleu_reward_func/mean": 0.0867965817451477, "rewards/bleu_reward_func/std": 0.06982331722974777, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 180.125, "completions/mean_terminated_length": 69.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.12, "grad_norm": 5.687462329864502, "kl": 0.079742431640625, "learning_rate": 1e-06, "loss": 0.1774, "num_tokens": 2012139.0, "reward": 0.08913667500019073, "reward_std": 0.03803376108407974, "rewards/bleu_reward_func/mean": 0.08913667500019073, "rewards/bleu_reward_func/std": 0.07373686879873276, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 246.6875, "completions/mean_terminated_length": 158.25, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.1208, "grad_norm": 6.3145341873168945, "kl": 0.168792724609375, "learning_rate": 1e-06, "loss": 0.0474, "num_tokens": 2024961.0, "reward": 0.09886027127504349, "reward_std": 0.09059572219848633, "rewards/bleu_reward_func/mean": 0.09886027127504349, "rewards/bleu_reward_func/std": 0.20261086523532867, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 278.375, "completions/mean_terminated_length": 200.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.1216, "grad_norm": 6.056458473205566, "kl": 0.04815673828125, "learning_rate": 1e-06, "loss": 0.0367, "num_tokens": 2038621.0, "reward": 0.040941424667835236, "reward_std": 0.024181999266147614, "rewards/bleu_reward_func/mean": 0.040941424667835236, "rewards/bleu_reward_func/std": 0.031022800132632256, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 79.91999816894531, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1224, "grad_norm": 11.584298133850098, "kl": 0.10552978515625, "learning_rate": 1e-06, "loss": -0.3199, "num_tokens": 2048947.0, "reward": 0.057592377066612244, "reward_std": 0.02831832319498062, "rewards/bleu_reward_func/mean": 0.057592377066612244, "rewards/bleu_reward_func/std": 0.0929059162735939, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 285.71875, "completions/mean_terminated_length": 197.17391967773438, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.1232, "grad_norm": 3.8603882789611816, "kl": 0.042999267578125, "learning_rate": 1e-06, "loss": -0.1903, "num_tokens": 2059978.0, "reward": 0.0238445196300745, "reward_std": 0.016163241118192673, "rewards/bleu_reward_func/mean": 0.0238445196300745, "rewards/bleu_reward_func/std": 0.020820245146751404, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 110.90625, "completions/mean_terminated_length": 110.90625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.124, "grad_norm": 9.231768608093262, "kl": 0.134063720703125, "learning_rate": 1e-06, "loss": -0.12, "num_tokens": 2068223.0, "reward": 0.093255415558815, "reward_std": 0.04695024713873863, "rewards/bleu_reward_func/mean": 0.093255415558815, "rewards/bleu_reward_func/std": 0.07957140356302261, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 151.85714721679688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1248, "grad_norm": 6.661685466766357, "kl": 0.11859130859375, "learning_rate": 1e-06, "loss": -0.1122, "num_tokens": 2079491.0, "reward": 0.10441941022872925, "reward_std": 0.06782116740942001, "rewards/bleu_reward_func/mean": 0.10441941022872925, "rewards/bleu_reward_func/std": 0.1558544933795929, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 417.84375, "completions/mean_terminated_length": 353.4210510253906, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.1256, "grad_norm": 2.281557559967041, "kl": 0.020050048828125, "learning_rate": 1e-06, "loss": -0.049, "num_tokens": 2094998.0, "reward": 0.03994186595082283, "reward_std": 0.020151065662503242, "rewards/bleu_reward_func/mean": 0.03994186595082283, "rewards/bleu_reward_func/std": 0.03798232972621918, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 126.51851654052734, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1264, "grad_norm": 5.349869251251221, "kl": 0.052093505859375, "learning_rate": 1e-06, "loss": 0.0712, "num_tokens": 2104006.0, "reward": 0.060490936040878296, "reward_std": 0.039247751235961914, "rewards/bleu_reward_func/mean": 0.060490936040878296, "rewards/bleu_reward_func/std": 0.06767360866069794, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 312.375, "completions/mean_terminated_length": 266.3077087402344, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1272, "grad_norm": 6.974966526031494, "kl": 0.0902099609375, "learning_rate": 1e-06, "loss": -0.0785, "num_tokens": 2121570.0, "reward": 0.21717938780784607, "reward_std": 0.08217764645814896, "rewards/bleu_reward_func/mean": 0.21717938780784607, "rewards/bleu_reward_func/std": 0.1689896285533905, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 350.25, "completions/mean_terminated_length": 239.57894897460938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.128, "grad_norm": 8.79633617401123, "kl": 0.10308837890625, "learning_rate": 1e-06, "loss": -0.0746, "num_tokens": 2134322.0, "reward": 0.027915209531784058, "reward_std": 0.008189969696104527, "rewards/bleu_reward_func/mean": 0.027915209531784058, "rewards/bleu_reward_func/std": 0.021798407658934593, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 98.40625, "completions/mean_terminated_length": 98.40625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1288, "grad_norm": 9.201173782348633, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.1926, "num_tokens": 2139903.0, "reward": 0.08629470318555832, "reward_std": 0.0329008549451828, "rewards/bleu_reward_func/mean": 0.08629470318555832, "rewards/bleu_reward_func/std": 0.04737285524606705, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 408.75, "completions/mean_terminated_length": 211.63636779785156, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.1296, "grad_norm": 3.5992963314056396, "kl": 0.034759521484375, "learning_rate": 1e-06, "loss": 0.2439, "num_tokens": 2158343.0, "reward": 0.07093626260757446, "reward_std": 0.04270578920841217, "rewards/bleu_reward_func/mean": 0.07093626260757446, "rewards/bleu_reward_func/std": 0.09919130057096481, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 92.84616088867188, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1304, "grad_norm": 8.71653938293457, "kl": 0.069793701171875, "learning_rate": 1e-06, "loss": -0.2102, "num_tokens": 2166957.0, "reward": 0.045812517404556274, "reward_std": 0.0257731880992651, "rewards/bleu_reward_func/mean": 0.045812517404556274, "rewards/bleu_reward_func/std": 0.033692970871925354, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 435.65625, "completions/mean_terminated_length": 383.4210510253906, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.1312, "grad_norm": 2.308507204055786, "kl": 0.020599365234375, "learning_rate": 1e-06, "loss": -0.142, "num_tokens": 2182538.0, "reward": 0.05681996047496796, "reward_std": 0.022751763463020325, "rewards/bleu_reward_func/mean": 0.05681996047496796, "rewards/bleu_reward_func/std": 0.034446995705366135, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 422.1875, "completions/mean_terminated_length": 368.3000183105469, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.132, "grad_norm": 2.0656521320343018, "kl": 0.0328369140625, "learning_rate": 1e-06, "loss": -0.1159, "num_tokens": 2198440.0, "reward": 0.08400298655033112, "reward_std": 0.03193335980176926, "rewards/bleu_reward_func/mean": 0.08400298655033112, "rewards/bleu_reward_func/std": 0.05056838318705559, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 117.20000457763672, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1328, "grad_norm": 7.659345626831055, "kl": 0.12249755859375, "learning_rate": 1e-06, "loss": -0.0683, "num_tokens": 2212440.0, "reward": 0.17941661179065704, "reward_std": 0.040813662111759186, "rewards/bleu_reward_func/mean": 0.17941661179065704, "rewards/bleu_reward_func/std": 0.2576500475406647, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.1336, "grad_norm": 4.253745079040527, "kl": 0.03948974609375, "learning_rate": 1e-06, "loss": -0.0369, "num_tokens": 2220868.0, "reward": 0.15958541631698608, "reward_std": 0.08837255835533142, "rewards/bleu_reward_func/mean": 0.15958541631698608, "rewards/bleu_reward_func/std": 0.2750999629497528, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 258.09375, "completions/mean_terminated_length": 158.7391357421875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1344, "grad_norm": 7.613649845123291, "kl": 0.230072021484375, "learning_rate": 1e-06, "loss": 0.2337, "num_tokens": 2232063.0, "reward": 0.0643484890460968, "reward_std": 0.04164566472172737, "rewards/bleu_reward_func/mean": 0.0643484890460968, "rewards/bleu_reward_func/std": 0.07561130821704865, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 312.75, "completions/mean_terminated_length": 193.1999969482422, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1352, "grad_norm": 7.345302104949951, "kl": 0.19305419921875, "learning_rate": 1e-06, "loss": -0.2036, "num_tokens": 2248271.0, "reward": 0.04911228269338608, "reward_std": 0.018512040376663208, "rewards/bleu_reward_func/mean": 0.04911228269338608, "rewards/bleu_reward_func/std": 0.05713532865047455, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 327.0, "completions/mean_terminated_length": 230.09524536132812, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.136, "grad_norm": 5.079345226287842, "kl": 0.02252197265625, "learning_rate": 1e-06, "loss": 0.1003, "num_tokens": 2261791.0, "reward": 0.03408445790410042, "reward_std": 0.007548983674496412, "rewards/bleu_reward_func/mean": 0.03408445790410042, "rewards/bleu_reward_func/std": 0.030450724065303802, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 173.3125, "completions/mean_terminated_length": 78.47999572753906, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.1368, "grad_norm": 7.255119800567627, "kl": 0.098876953125, "learning_rate": 1e-06, "loss": -0.1893, "num_tokens": 2271041.0, "reward": 0.08309763669967651, "reward_std": 0.05162087082862854, "rewards/bleu_reward_func/mean": 0.08309763669967651, "rewards/bleu_reward_func/std": 0.08563226461410522, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 251.59375, "completions/mean_terminated_length": 164.7916717529297, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1376, "grad_norm": 9.955636024475098, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.4501, "num_tokens": 2281108.0, "reward": 0.09667688608169556, "reward_std": 0.047036267817020416, "rewards/bleu_reward_func/mean": 0.09667688608169556, "rewards/bleu_reward_func/std": 0.05911566689610481, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 251.48385620117188, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1384, "grad_norm": 2.710672616958618, "kl": 0.05035400390625, "learning_rate": 1e-06, "loss": -0.1131, "num_tokens": 2292040.0, "reward": 0.01771564967930317, "reward_std": 0.0045564379543066025, "rewards/bleu_reward_func/mean": 0.01771564967930317, "rewards/bleu_reward_func/std": 0.009397609159350395, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 279.15625, "completions/mean_terminated_length": 119.84210968017578, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1392, "grad_norm": 5.8446946144104, "kl": 0.10295867919921875, "learning_rate": 1e-06, "loss": -0.0422, "num_tokens": 2306453.0, "reward": 0.10576937347650528, "reward_std": 0.040997594594955444, "rewards/bleu_reward_func/mean": 0.10576937347650528, "rewards/bleu_reward_func/std": 0.15739315748214722, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 406.375, "completions/mean_terminated_length": 270.5714416503906, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.14, "grad_norm": 2.8740079402923584, "kl": 0.022064208984375, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 2321945.0, "reward": 0.07392336428165436, "reward_std": 0.027644775807857513, "rewards/bleu_reward_func/mean": 0.07392336428165436, "rewards/bleu_reward_func/std": 0.079840287566185, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 384.6875, "completions/mean_terminated_length": 308.3000183105469, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1408, "grad_norm": 2.571645498275757, "kl": 0.0183868408203125, "learning_rate": 1e-06, "loss": -0.106, "num_tokens": 2338695.0, "reward": 0.043530724942684174, "reward_std": 0.02269122190773487, "rewards/bleu_reward_func/mean": 0.043530724942684174, "rewards/bleu_reward_func/std": 0.029228538274765015, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 276.71875, "completions/mean_terminated_length": 184.6521759033203, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.1416, "grad_norm": 6.338461399078369, "kl": 0.0661468505859375, "learning_rate": 1e-06, "loss": 0.2588, "num_tokens": 2349054.0, "reward": 0.04008907824754715, "reward_std": 0.03199386969208717, "rewards/bleu_reward_func/mean": 0.04008907824754715, "rewards/bleu_reward_func/std": 0.05116712674498558, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 367.84375, "completions/mean_terminated_length": 182.50001525878906, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1424, "grad_norm": 7.893227577209473, "kl": 0.1038818359375, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 2365297.0, "reward": 0.05835431069135666, "reward_std": 0.01447733398526907, "rewards/bleu_reward_func/mean": 0.05835431069135666, "rewards/bleu_reward_func/std": 0.05388018116354942, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 106.9375, "completions/mean_terminated_length": 79.93333435058594, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1432, "grad_norm": 13.133338928222656, "kl": 0.378204345703125, "learning_rate": 1e-06, "loss": 0.1078, "num_tokens": 2377279.0, "reward": 0.27373576164245605, "reward_std": 0.10149600356817245, "rewards/bleu_reward_func/mean": 0.27373576164245605, "rewards/bleu_reward_func/std": 0.21089527010917664, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 384.875, "completions/mean_terminated_length": 318.28570556640625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.144, "grad_norm": 2.6368002891540527, "kl": 0.02276611328125, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 2393507.0, "reward": 0.06703202426433563, "reward_std": 0.02514977753162384, "rewards/bleu_reward_func/mean": 0.06703202426433563, "rewards/bleu_reward_func/std": 0.05334871634840965, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 216.15625, "completions/mean_terminated_length": 147.88462829589844, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1448, "grad_norm": 26.23644256591797, "kl": 0.052398681640625, "learning_rate": 1e-06, "loss": 0.2092, "num_tokens": 2403920.0, "reward": 0.07073464244604111, "reward_std": 0.0369129553437233, "rewards/bleu_reward_func/mean": 0.07073464244604111, "rewards/bleu_reward_func/std": 0.04567345231771469, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 285.09375, "completions/mean_terminated_length": 166.23809814453125, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1456, "grad_norm": 12.993139266967773, "kl": 0.135498046875, "learning_rate": 1e-06, "loss": 0.241, "num_tokens": 2416907.0, "reward": 0.05180336907505989, "reward_std": 0.024485625326633453, "rewards/bleu_reward_func/mean": 0.05180336907505989, "rewards/bleu_reward_func/std": 0.03925548121333122, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 196.96875, "completions/mean_terminated_length": 91.95833587646484, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1464, "grad_norm": 8.714866638183594, "kl": 0.2222137451171875, "learning_rate": 1e-06, "loss": 0.1099, "num_tokens": 2428778.0, "reward": 0.08364134281873703, "reward_std": 0.042949263006448746, "rewards/bleu_reward_func/mean": 0.08364134281873703, "rewards/bleu_reward_func/std": 0.09259536862373352, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 314.09375, "completions/mean_terminated_length": 293.6206970214844, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1472, "grad_norm": 2.9456305503845215, "kl": 0.029815673828125, "learning_rate": 1e-06, "loss": -0.172, "num_tokens": 2441829.0, "reward": 0.11525549739599228, "reward_std": 0.056866977363824844, "rewards/bleu_reward_func/mean": 0.11525549739599228, "rewards/bleu_reward_func/std": 0.10229503363370895, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 310.21875, "completions/mean_terminated_length": 108.4375, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.148, "grad_norm": 4.448924541473389, "kl": 0.118255615234375, "learning_rate": 1e-06, "loss": -0.0768, "num_tokens": 2456612.0, "reward": 0.1624433547258377, "reward_std": 0.045910030603408813, "rewards/bleu_reward_func/mean": 0.1624433547258377, "rewards/bleu_reward_func/std": 0.19173115491867065, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 134.09375, "completions/mean_terminated_length": 28.279998779296875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.1488, "grad_norm": 13.645524024963379, "kl": 0.1475830078125, "learning_rate": 1e-06, "loss": 0.0664, "num_tokens": 2464839.0, "reward": 0.05004946142435074, "reward_std": 0.03280433267354965, "rewards/bleu_reward_func/mean": 0.05004946142435074, "rewards/bleu_reward_func/std": 0.05075250193476677, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 383.28125, "completions/mean_terminated_length": 332.9130554199219, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.1496, "grad_norm": 2.3289942741394043, "kl": 0.0212860107421875, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 2478608.0, "reward": 0.03798733651638031, "reward_std": 0.014268442057073116, "rewards/bleu_reward_func/mean": 0.03798733651638031, "rewards/bleu_reward_func/std": 0.03045865148305893, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 355.9375, "completions/mean_terminated_length": 303.91668701171875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.1504, "grad_norm": 3.192103862762451, "kl": 0.0250701904296875, "learning_rate": 1e-06, "loss": 0.167, "num_tokens": 2492518.0, "reward": 0.02103330008685589, "reward_std": 0.0090586943551898, "rewards/bleu_reward_func/mean": 0.02103330008685589, "rewards/bleu_reward_func/std": 0.01017869170755148, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 60.375, "completions/mean_terminated_length": 60.375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1512, "grad_norm": 10.600973129272461, "kl": 0.31732177734375, "learning_rate": 1e-06, "loss": -0.0974, "num_tokens": 2503642.0, "reward": 0.2223111093044281, "reward_std": 0.05318839102983475, "rewards/bleu_reward_func/mean": 0.2223111093044281, "rewards/bleu_reward_func/std": 0.1549021303653717, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 468.3125, "completions/mean_terminated_length": 384.9090881347656, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.152, "grad_norm": 2.152480363845825, "kl": 0.02301025390625, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 2521540.0, "reward": 0.04742058366537094, "reward_std": 0.0165211483836174, "rewards/bleu_reward_func/mean": 0.04742058366537094, "rewards/bleu_reward_func/std": 0.038380105048418045, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 295.5625, "completions/mean_terminated_length": 147.4736785888672, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.1528, "grad_norm": 3.0485126972198486, "kl": 0.0381622314453125, "learning_rate": 1e-06, "loss": 0.5548, "num_tokens": 2536702.0, "reward": 0.059137165546417236, "reward_std": 0.029524236917495728, "rewards/bleu_reward_func/mean": 0.059137165546417236, "rewards/bleu_reward_func/std": 0.04191603511571884, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 305.4375, "completions/mean_terminated_length": 211.5454559326172, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1536, "grad_norm": 3.727417230606079, "kl": 0.06072998046875, "learning_rate": 1e-06, "loss": 0.0741, "num_tokens": 2553892.0, "reward": 0.06053918972611427, "reward_std": 0.025174250826239586, "rewards/bleu_reward_func/mean": 0.06053918972611427, "rewards/bleu_reward_func/std": 0.03798559308052063, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 327.0625, "completions/mean_terminated_length": 216.10000610351562, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1544, "grad_norm": 22.730863571166992, "kl": 0.1234130859375, "learning_rate": 1e-06, "loss": -0.1216, "num_tokens": 2569950.0, "reward": 0.14068183302879333, "reward_std": 0.05201031640172005, "rewards/bleu_reward_func/mean": 0.14068183302879333, "rewards/bleu_reward_func/std": 0.1718810796737671, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 189.71875, "completions/mean_terminated_length": 82.29167175292969, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.1552, "grad_norm": 5.675025939941406, "kl": 0.063934326171875, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 2579693.0, "reward": 0.08947663754224777, "reward_std": 0.029948215931653976, "rewards/bleu_reward_func/mean": 0.08947663754224777, "rewards/bleu_reward_func/std": 0.06868135929107666, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 210.5625, "completions/mean_terminated_length": 167.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.156, "grad_norm": 6.797698974609375, "kl": 0.17242431640625, "learning_rate": 1e-06, "loss": -0.0415, "num_tokens": 2592967.0, "reward": 0.16623055934906006, "reward_std": 0.08808746933937073, "rewards/bleu_reward_func/mean": 0.16623055934906006, "rewards/bleu_reward_func/std": 0.17983676493167877, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 119.09375, "completions/mean_terminated_length": 119.09375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1568, "grad_norm": 5.494751453399658, "kl": 0.05902099609375, "learning_rate": 1e-06, "loss": 0.2727, "num_tokens": 2602042.0, "reward": 0.17185799777507782, "reward_std": 0.10617370158433914, "rewards/bleu_reward_func/mean": 0.17185799777507782, "rewards/bleu_reward_func/std": 0.16121239960193634, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 368.5, "completions/mean_terminated_length": 293.3333435058594, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1576, "grad_norm": 4.1480302810668945, "kl": 0.0312652587890625, "learning_rate": 1e-06, "loss": -0.1879, "num_tokens": 2615826.0, "reward": 0.051297686994075775, "reward_std": 0.018504546955227852, "rewards/bleu_reward_func/mean": 0.051297686994075775, "rewards/bleu_reward_func/std": 0.034977275878190994, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 267.53125, "completions/mean_terminated_length": 251.2333526611328, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1584, "grad_norm": 4.113983631134033, "kl": 0.027557373046875, "learning_rate": 1e-06, "loss": 0.0749, "num_tokens": 2626275.0, "reward": 0.054141815751791, "reward_std": 0.02476467750966549, "rewards/bleu_reward_func/mean": 0.054141815751791, "rewards/bleu_reward_func/std": 0.07109448313713074, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 112.96875, "completions/mean_terminated_length": 112.96875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1592, "grad_norm": 7.432074546813965, "kl": 0.1759033203125, "learning_rate": 1e-06, "loss": 0.0648, "num_tokens": 2633962.0, "reward": 0.16682901978492737, "reward_std": 0.07138749957084656, "rewards/bleu_reward_func/mean": 0.16682901978492737, "rewards/bleu_reward_func/std": 0.15276572108268738, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 451.78125, "completions/mean_terminated_length": 404.9444580078125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.16, "grad_norm": 2.110192060470581, "kl": 0.022369384765625, "learning_rate": 1e-06, "loss": -0.0268, "num_tokens": 2653971.0, "reward": 0.11942745745182037, "reward_std": 0.02005620300769806, "rewards/bleu_reward_func/mean": 0.11942745745182037, "rewards/bleu_reward_func/std": 0.09454692155122757, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 185.9375, "completions/mean_terminated_length": 110.69231414794922, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1608, "grad_norm": 6.2729973793029785, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": -0.0659, "num_tokens": 2664601.0, "reward": 0.03557516261935234, "reward_std": 0.021523961797356606, "rewards/bleu_reward_func/mean": 0.03557516261935234, "rewards/bleu_reward_func/std": 0.02618589997291565, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 121.77777862548828, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1616, "grad_norm": 5.936282157897949, "kl": 0.0358734130859375, "learning_rate": 1e-06, "loss": -0.2742, "num_tokens": 2679849.0, "reward": 0.038136985152959824, "reward_std": 0.022807471454143524, "rewards/bleu_reward_func/mean": 0.038136985152959824, "rewards/bleu_reward_func/std": 0.061121899634599686, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 230.96875, "completions/mean_terminated_length": 221.90321350097656, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.1624, "grad_norm": 8.785550117492676, "kl": 0.1523590087890625, "learning_rate": 1e-06, "loss": -0.2049, "num_tokens": 2693024.0, "reward": 0.1289938986301422, "reward_std": 0.045512765645980835, "rewards/bleu_reward_func/mean": 0.1289938986301422, "rewards/bleu_reward_func/std": 0.09638386219739914, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 361.8125, "completions/mean_terminated_length": 168.71429443359375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1632, "grad_norm": 6.617871284484863, "kl": 0.0502777099609375, "learning_rate": 1e-06, "loss": 0.1304, "num_tokens": 2710354.0, "reward": 0.033049020916223526, "reward_std": 0.017362549901008606, "rewards/bleu_reward_func/mean": 0.033049020916223526, "rewards/bleu_reward_func/std": 0.026102159172296524, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 281.4375, "completions/mean_terminated_length": 176.63636779785156, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.164, "grad_norm": 3.961705446243286, "kl": 0.104736328125, "learning_rate": 1e-06, "loss": -0.0209, "num_tokens": 2724680.0, "reward": 0.1750263273715973, "reward_std": 0.02830299735069275, "rewards/bleu_reward_func/mean": 0.1750263273715973, "rewards/bleu_reward_func/std": 0.13747908174991608, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 287.28125, "completions/mean_terminated_length": 255.1785888671875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.1648, "grad_norm": 3.098118305206299, "kl": 0.020477294921875, "learning_rate": 1e-06, "loss": -0.1863, "num_tokens": 2736209.0, "reward": 0.06041261553764343, "reward_std": 0.033261410892009735, "rewards/bleu_reward_func/mean": 0.06041261553764343, "rewards/bleu_reward_func/std": 0.046081364154815674, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 230.1875, "completions/mean_terminated_length": 136.25, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.1656, "grad_norm": 6.539205551147461, "kl": 0.0445404052734375, "learning_rate": 1e-06, "loss": 0.0949, "num_tokens": 2749503.0, "reward": 0.039952248334884644, "reward_std": 0.05510722100734711, "rewards/bleu_reward_func/mean": 0.039952248334884644, "rewards/bleu_reward_func/std": 0.08833327889442444, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 385.8125, "completions/mean_terminated_length": 223.57144165039062, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.1664, "grad_norm": 3.262167453765869, "kl": 0.0330657958984375, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 2763289.0, "reward": 0.06319095194339752, "reward_std": 0.021728292107582092, "rewards/bleu_reward_func/mean": 0.06319095194339752, "rewards/bleu_reward_func/std": 0.03750937059521675, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 182.71875, "completions/mean_terminated_length": 172.09677124023438, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.1672, "grad_norm": 6.667765140533447, "kl": 0.113616943359375, "learning_rate": 1e-06, "loss": 0.3285, "num_tokens": 2773104.0, "reward": 0.1846303939819336, "reward_std": 0.16774994134902954, "rewards/bleu_reward_func/mean": 0.1846303939819336, "rewards/bleu_reward_func/std": 0.20520828664302826, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 204.6875, "completions/mean_terminated_length": 147.7777862548828, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.168, "grad_norm": 5.794483661651611, "kl": 0.09796142578125, "learning_rate": 1e-06, "loss": 0.1363, "num_tokens": 2787670.0, "reward": 0.09086121618747711, "reward_std": 0.052026841789484024, "rewards/bleu_reward_func/mean": 0.09086121618747711, "rewards/bleu_reward_func/std": 0.09278357774019241, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 399.8125, "completions/mean_terminated_length": 113.11111450195312, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.1688, "grad_norm": 11.576338768005371, "kl": 0.0574493408203125, "learning_rate": 1e-06, "loss": 0.1204, "num_tokens": 2805864.0, "reward": 0.023652518168091774, "reward_std": 0.01210303045809269, "rewards/bleu_reward_func/mean": 0.023652518168091774, "rewards/bleu_reward_func/std": 0.02501726523041725, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 157.6875, "completions/mean_terminated_length": 58.47999954223633, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1696, "grad_norm": 10.228835105895996, "kl": 0.1674041748046875, "learning_rate": 1e-06, "loss": 0.0582, "num_tokens": 2816758.0, "reward": 0.13800185918807983, "reward_std": 0.047296687960624695, "rewards/bleu_reward_func/mean": 0.13800185918807983, "rewards/bleu_reward_func/std": 0.0863277018070221, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 402.15625, "completions/mean_terminated_length": 359.1739196777344, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.1704, "grad_norm": 2.593717336654663, "kl": 0.01934814453125, "learning_rate": 1e-06, "loss": -0.0091, "num_tokens": 2833347.0, "reward": 0.05193600431084633, "reward_std": 0.018484318628907204, "rewards/bleu_reward_func/mean": 0.05193600431084633, "rewards/bleu_reward_func/std": 0.04251272976398468, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1712, "grad_norm": 6.071621894836426, "kl": 0.057373046875, "learning_rate": 1e-06, "loss": -0.1354, "num_tokens": 2841195.0, "reward": 0.062206219881772995, "reward_std": 0.03749649226665497, "rewards/bleu_reward_func/mean": 0.062206219881772995, "rewards/bleu_reward_func/std": 0.0528765432536602, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 246.40625, "completions/mean_terminated_length": 218.9310302734375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.172, "grad_norm": 4.833486557006836, "kl": 0.068695068359375, "learning_rate": 1e-06, "loss": 0.1631, "num_tokens": 2851224.0, "reward": 0.06542235612869263, "reward_std": 0.03771442174911499, "rewards/bleu_reward_func/mean": 0.06542235612869263, "rewards/bleu_reward_func/std": 0.0579860620200634, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 465.15625, "completions/mean_terminated_length": 412.0666809082031, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.1728, "grad_norm": 2.1820828914642334, "kl": 0.0188446044921875, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 2870765.0, "reward": 0.06440776586532593, "reward_std": 0.013088207691907883, "rewards/bleu_reward_func/mean": 0.06440776586532593, "rewards/bleu_reward_func/std": 0.06307429075241089, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 224.0625, "completions/mean_terminated_length": 170.74073791503906, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1736, "grad_norm": 9.596390724182129, "kl": 0.2492218017578125, "learning_rate": 1e-06, "loss": 0.1621, "num_tokens": 2882151.0, "reward": 0.15283548831939697, "reward_std": 0.08103044331073761, "rewards/bleu_reward_func/mean": 0.15283548831939697, "rewards/bleu_reward_func/std": 0.13223250210285187, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 464.09375, "completions/mean_terminated_length": 320.375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1744, "grad_norm": 2.1536099910736084, "kl": 0.02032470703125, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 2900658.0, "reward": 0.02000538259744644, "reward_std": 0.008671639487147331, "rewards/bleu_reward_func/mean": 0.02000538259744644, "rewards/bleu_reward_func/std": 0.01867109164595604, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 55.68000030517578, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1752, "grad_norm": 8.202893257141113, "kl": 0.2074127197265625, "learning_rate": 1e-06, "loss": -0.0157, "num_tokens": 2909778.0, "reward": 0.1296558678150177, "reward_std": 0.04394569993019104, "rewards/bleu_reward_func/mean": 0.1296558678150177, "rewards/bleu_reward_func/std": 0.05605300888419151, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 350.40625, "completions/mean_terminated_length": 188.8125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.176, "grad_norm": 5.133015155792236, "kl": 0.0847930908203125, "learning_rate": 1e-06, "loss": 0.2562, "num_tokens": 2926239.0, "reward": 0.1607290506362915, "reward_std": 0.12061528861522675, "rewards/bleu_reward_func/mean": 0.1607290506362915, "rewards/bleu_reward_func/std": 0.19297951459884644, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 377.09375, "completions/mean_terminated_length": 306.4285888671875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.1768, "grad_norm": 2.917404890060425, "kl": 0.02947998046875, "learning_rate": 1e-06, "loss": 0.0902, "num_tokens": 2940970.0, "reward": 0.05033531412482262, "reward_std": 0.015085380524396896, "rewards/bleu_reward_func/mean": 0.05033531412482262, "rewards/bleu_reward_func/std": 0.03601166605949402, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 98.625, "completions/mean_terminated_length": 98.625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.1776, "grad_norm": 9.739842414855957, "kl": 0.28759765625, "learning_rate": 1e-06, "loss": 0.1954, "num_tokens": 2951942.0, "reward": 0.18511344492435455, "reward_std": 0.09618590772151947, "rewards/bleu_reward_func/mean": 0.18511344492435455, "rewards/bleu_reward_func/std": 0.13407698273658752, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 155.1875, "completions/mean_terminated_length": 131.40000915527344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1784, "grad_norm": 5.931830883026123, "kl": 0.082611083984375, "learning_rate": 1e-06, "loss": 0.0924, "num_tokens": 2959460.0, "reward": 0.07271347939968109, "reward_std": 0.05200031027197838, "rewards/bleu_reward_func/mean": 0.07271347939968109, "rewards/bleu_reward_func/std": 0.06765022873878479, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 130.71875, "completions/mean_terminated_length": 105.30000305175781, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.1792, "grad_norm": 7.8368730545043945, "kl": 0.09417724609375, "learning_rate": 1e-06, "loss": 0.1685, "num_tokens": 2966491.0, "reward": 0.0899183601140976, "reward_std": 0.05122753232717514, "rewards/bleu_reward_func/mean": 0.0899183601140976, "rewards/bleu_reward_func/std": 0.11120127141475677, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 367.5, "completions/mean_terminated_length": 268.631591796875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.18, "grad_norm": 3.555055618286133, "kl": 0.0318603515625, "learning_rate": 1e-06, "loss": -0.0636, "num_tokens": 2982515.0, "reward": 0.11773502081632614, "reward_std": 0.046606093645095825, "rewards/bleu_reward_func/mean": 0.11773502081632614, "rewards/bleu_reward_func/std": 0.15673232078552246, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 122.65625, "completions/mean_terminated_length": 122.65625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1808, "grad_norm": 10.452176094055176, "kl": 0.1519775390625, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 2991512.0, "reward": 0.13446207344532013, "reward_std": 0.060547836124897, "rewards/bleu_reward_func/mean": 0.13446207344532013, "rewards/bleu_reward_func/std": 0.07454977184534073, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 345.6875, "completions/mean_terminated_length": 258.5714416503906, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.1816, "grad_norm": 2.964317560195923, "kl": 0.038421630859375, "learning_rate": 1e-06, "loss": 0.1194, "num_tokens": 3005678.0, "reward": 0.14132392406463623, "reward_std": 0.05001860111951828, "rewards/bleu_reward_func/mean": 0.14132392406463623, "rewards/bleu_reward_func/std": 0.08175285160541534, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 390.125, "completions/mean_terminated_length": 122.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.1824, "grad_norm": 5.049752235412598, "kl": 0.04425048828125, "learning_rate": 1e-06, "loss": 0.1754, "num_tokens": 3021434.0, "reward": 0.04336467757821083, "reward_std": 0.018742987886071205, "rewards/bleu_reward_func/mean": 0.04336467757821083, "rewards/bleu_reward_func/std": 0.03402964025735855, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 364.0625, "completions/mean_terminated_length": 249.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1832, "grad_norm": 3.026240348815918, "kl": 0.0245361328125, "learning_rate": 1e-06, "loss": -0.2846, "num_tokens": 3040956.0, "reward": 0.028285246342420578, "reward_std": 0.018473699688911438, "rewards/bleu_reward_func/mean": 0.028285246342420578, "rewards/bleu_reward_func/std": 0.02460222877562046, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 370.1875, "completions/mean_terminated_length": 330.47998046875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.184, "grad_norm": 2.621922731399536, "kl": 0.0340728759765625, "learning_rate": 1e-06, "loss": -0.0699, "num_tokens": 3058866.0, "reward": 0.18184542655944824, "reward_std": 0.06604617834091187, "rewards/bleu_reward_func/mean": 0.18184542655944824, "rewards/bleu_reward_func/std": 0.16794371604919434, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 213.71875, "completions/mean_terminated_length": 97.0, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.1848, "grad_norm": 5.496671676635742, "kl": 0.11822509765625, "learning_rate": 1e-06, "loss": 0.4021, "num_tokens": 3071713.0, "reward": 0.22397759556770325, "reward_std": 0.09391038119792938, "rewards/bleu_reward_func/mean": 0.22397759556770325, "rewards/bleu_reward_func/std": 0.19180122017860413, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 197.84375, "completions/mean_terminated_length": 93.125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.1856, "grad_norm": 3.808242082595825, "kl": 0.04571533203125, "learning_rate": 1e-06, "loss": 0.0821, "num_tokens": 3079892.0, "reward": 0.060666900128126144, "reward_std": 0.029011715203523636, "rewards/bleu_reward_func/mean": 0.060666900128126144, "rewards/bleu_reward_func/std": 0.0762709304690361, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 160.40625, "completions/mean_terminated_length": 61.959999084472656, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.1864, "grad_norm": 11.2310791015625, "kl": 0.160797119140625, "learning_rate": 1e-06, "loss": 0.2881, "num_tokens": 3087689.0, "reward": 0.07089974731206894, "reward_std": 0.03123306669294834, "rewards/bleu_reward_func/mean": 0.07089974731206894, "rewards/bleu_reward_func/std": 0.06456828862428665, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 66.875, "completions/mean_terminated_length": 66.875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.1872, "grad_norm": 13.989295959472656, "kl": 0.3311767578125, "learning_rate": 1e-06, "loss": -0.0331, "num_tokens": 3093357.0, "reward": 0.15325351059436798, "reward_std": 0.0506255105137825, "rewards/bleu_reward_func/mean": 0.15325351059436798, "rewards/bleu_reward_func/std": 0.19497260451316833, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 162.43478393554688, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.188, "grad_norm": 7.557122230529785, "kl": 0.173126220703125, "learning_rate": 1e-06, "loss": 0.1592, "num_tokens": 3105749.0, "reward": 0.20930011570453644, "reward_std": 0.06161898747086525, "rewards/bleu_reward_func/mean": 0.20930011570453644, "rewards/bleu_reward_func/std": 0.2159973680973053, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 404.9375, "completions/mean_terminated_length": 310.4705810546875, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1888, "grad_norm": 4.613722324371338, "kl": 0.025421142578125, "learning_rate": 1e-06, "loss": -0.1006, "num_tokens": 3121795.0, "reward": 0.02748030610382557, "reward_std": 0.0075658103451132774, "rewards/bleu_reward_func/mean": 0.02748030610382557, "rewards/bleu_reward_func/std": 0.03438537195324898, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 389.09375, "completions/mean_terminated_length": 266.1875, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1896, "grad_norm": 2.435314178466797, "kl": 0.0276031494140625, "learning_rate": 1e-06, "loss": -0.1746, "num_tokens": 3140006.0, "reward": 0.10853572189807892, "reward_std": 0.05605427548289299, "rewards/bleu_reward_func/mean": 0.10853572189807892, "rewards/bleu_reward_func/std": 0.1485956311225891, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 149.78125, "completions/mean_terminated_length": 29.041667938232422, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.1904, "grad_norm": 8.839442253112793, "kl": 0.2632598876953125, "learning_rate": 1e-06, "loss": 0.0665, "num_tokens": 3149743.0, "reward": 0.13384486734867096, "reward_std": 0.03735985979437828, "rewards/bleu_reward_func/mean": 0.13384486734867096, "rewards/bleu_reward_func/std": 0.17275770008563995, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 178.1666717529297, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1912, "grad_norm": 4.326257228851318, "kl": 0.14703369140625, "learning_rate": 1e-06, "loss": -0.0395, "num_tokens": 3163195.0, "reward": 0.16435688734054565, "reward_std": 0.051772814244031906, "rewards/bleu_reward_func/mean": 0.16435688734054565, "rewards/bleu_reward_func/std": 0.13062016665935516, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 359.4375, "completions/mean_terminated_length": 255.05262756347656, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.192, "grad_norm": 6.709453582763672, "kl": 0.097503662109375, "learning_rate": 1e-06, "loss": -0.0418, "num_tokens": 3180209.0, "reward": 0.10101380944252014, "reward_std": 0.030364379286766052, "rewards/bleu_reward_func/mean": 0.10101380944252014, "rewards/bleu_reward_func/std": 0.08647928386926651, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 93.43999481201172, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.1928, "grad_norm": 9.118388175964355, "kl": 0.14984130859375, "learning_rate": 1e-06, "loss": -0.0512, "num_tokens": 3189425.0, "reward": 0.19255727529525757, "reward_std": 0.03786986321210861, "rewards/bleu_reward_func/mean": 0.19255727529525757, "rewards/bleu_reward_func/std": 0.18927834928035736, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 162.4375, "completions/mean_terminated_length": 81.76923370361328, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.1936, "grad_norm": 7.5658745765686035, "kl": 0.1087493896484375, "learning_rate": 1e-06, "loss": 0.11, "num_tokens": 3196911.0, "reward": 0.08898752182722092, "reward_std": 0.01980067417025566, "rewards/bleu_reward_func/mean": 0.08898752182722092, "rewards/bleu_reward_func/std": 0.09810609370470047, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 238.8125, "completions/mean_terminated_length": 175.7692413330078, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.1944, "grad_norm": 3.6591224670410156, "kl": 0.068145751953125, "learning_rate": 1e-06, "loss": 0.0677, "num_tokens": 3212161.0, "reward": 0.16356298327445984, "reward_std": 0.08266205340623856, "rewards/bleu_reward_func/mean": 0.16356298327445984, "rewards/bleu_reward_func/std": 0.17177340388298035, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 369.0625, "completions/mean_terminated_length": 294.19049072265625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.1952, "grad_norm": 2.8115499019622803, "kl": 0.032623291015625, "learning_rate": 1e-06, "loss": 0.0943, "num_tokens": 3228483.0, "reward": 0.06906401365995407, "reward_std": 0.025964463129639626, "rewards/bleu_reward_func/mean": 0.06906401365995407, "rewards/bleu_reward_func/std": 0.044564370065927505, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 253.875, "completions/mean_terminated_length": 194.3076934814453, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.196, "grad_norm": 6.153151512145996, "kl": 0.072845458984375, "learning_rate": 1e-06, "loss": 0.1336, "num_tokens": 3238447.0, "reward": 0.05225534737110138, "reward_std": 0.019162572920322418, "rewards/bleu_reward_func/mean": 0.05225534737110138, "rewards/bleu_reward_func/std": 0.04069560393691063, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 357.6875, "completions/mean_terminated_length": 237.6666717529297, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.1968, "grad_norm": 4.332682132720947, "kl": 0.074615478515625, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 3252661.0, "reward": 0.06644366681575775, "reward_std": 0.029834389686584473, "rewards/bleu_reward_func/mean": 0.06644366681575775, "rewards/bleu_reward_func/std": 0.0527600534260273, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 280.84375, "completions/mean_terminated_length": 203.7916717529297, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.1976, "grad_norm": 3.9714043140411377, "kl": 0.046600341796875, "learning_rate": 1e-06, "loss": 0.0574, "num_tokens": 3263392.0, "reward": 0.04084426164627075, "reward_std": 0.022724341601133347, "rewards/bleu_reward_func/mean": 0.04084426164627075, "rewards/bleu_reward_func/std": 0.03625248372554779, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 246.1875, "completions/mean_terminated_length": 171.75999450683594, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.1984, "grad_norm": 7.287817478179932, "kl": 0.128570556640625, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 3274190.0, "reward": 0.05778396502137184, "reward_std": 0.020291190594434738, "rewards/bleu_reward_func/mean": 0.05778396502137184, "rewards/bleu_reward_func/std": 0.046611472964286804, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 317.21875, "completions/mean_terminated_length": 183.94737243652344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1992, "grad_norm": 9.650996208190918, "kl": 0.1335906982421875, "learning_rate": 1e-06, "loss": -0.0956, "num_tokens": 3288605.0, "reward": 0.15271537005901337, "reward_std": 0.0891089141368866, "rewards/bleu_reward_func/mean": 0.15271537005901337, "rewards/bleu_reward_func/std": 0.1993638128042221, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 238.0869598388672, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2, "grad_norm": 8.128390312194824, "kl": 0.073394775390625, "learning_rate": 1e-06, "loss": 0.1129, "num_tokens": 3303449.0, "reward": 0.05582565814256668, "reward_std": 0.04732588678598404, "rewards/bleu_reward_func/mean": 0.05582565814256668, "rewards/bleu_reward_func/std": 0.06975270062685013, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 397.34375, "completions/mean_terminated_length": 282.6875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.2008, "grad_norm": 2.3954126834869385, "kl": 0.027587890625, "learning_rate": 1e-06, "loss": -0.0758, "num_tokens": 3322684.0, "reward": 0.20381565392017365, "reward_std": 0.06331950426101685, "rewards/bleu_reward_func/mean": 0.20381565392017365, "rewards/bleu_reward_func/std": 0.30689555406570435, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 177.29031372070312, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2016, "grad_norm": 10.365123748779297, "kl": 0.403076171875, "learning_rate": 1e-06, "loss": -0.0409, "num_tokens": 3332612.0, "reward": 0.09179520606994629, "reward_std": 0.042515259236097336, "rewards/bleu_reward_func/mean": 0.09179520606994629, "rewards/bleu_reward_func/std": 0.06000783294439316, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 256.4375, "completions/mean_terminated_length": 184.87998962402344, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2024, "grad_norm": 5.642463207244873, "kl": 0.1347198486328125, "learning_rate": 1e-06, "loss": -0.0626, "num_tokens": 3347706.0, "reward": 0.12519359588623047, "reward_std": 0.036009326577186584, "rewards/bleu_reward_func/mean": 0.12519359588623047, "rewards/bleu_reward_func/std": 0.1556256264448166, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 133.4375, "completions/mean_terminated_length": 79.35714721679688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2032, "grad_norm": 13.66992473602295, "kl": 0.30645751953125, "learning_rate": 1e-06, "loss": -0.1315, "num_tokens": 3353864.0, "reward": 0.10196495056152344, "reward_std": 0.05300650745630264, "rewards/bleu_reward_func/mean": 0.10196495056152344, "rewards/bleu_reward_func/std": 0.09023614972829819, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 267.6875, "completions/mean_terminated_length": 232.7857208251953, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.204, "grad_norm": 24.040653228759766, "kl": 0.062896728515625, "learning_rate": 1e-06, "loss": -0.1232, "num_tokens": 3365758.0, "reward": 0.03941156342625618, "reward_std": 0.017305800691246986, "rewards/bleu_reward_func/mean": 0.03941156342625618, "rewards/bleu_reward_func/std": 0.02295033633708954, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 420.21875, "completions/mean_terminated_length": 316.20001220703125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.2048, "grad_norm": 2.9320602416992188, "kl": 0.033233642578125, "learning_rate": 1e-06, "loss": -0.1371, "num_tokens": 3382765.0, "reward": 0.05339156836271286, "reward_std": 0.02982841432094574, "rewards/bleu_reward_func/mean": 0.05339156836271286, "rewards/bleu_reward_func/std": 0.07343700528144836, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 247.8125, "completions/mean_terminated_length": 173.83999633789062, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2056, "grad_norm": 8.614324569702148, "kl": 0.1400146484375, "learning_rate": 1e-06, "loss": 0.1332, "num_tokens": 3394391.0, "reward": 0.06851230561733246, "reward_std": 0.04152427613735199, "rewards/bleu_reward_func/mean": 0.06851230561733246, "rewards/bleu_reward_func/std": 0.056356508284807205, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 365.625, "completions/mean_terminated_length": 199.73333740234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2064, "grad_norm": 6.318526744842529, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": -0.0091, "num_tokens": 3411195.0, "reward": 0.08351869136095047, "reward_std": 0.012093533761799335, "rewards/bleu_reward_func/mean": 0.08351869136095047, "rewards/bleu_reward_func/std": 0.08073550462722778, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 134.65625, "completions/mean_terminated_length": 109.50000762939453, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2072, "grad_norm": 7.35445499420166, "kl": 0.2371826171875, "learning_rate": 1e-06, "loss": 0.4692, "num_tokens": 3419136.0, "reward": 0.15089674293994904, "reward_std": 0.06239618360996246, "rewards/bleu_reward_func/mean": 0.15089674293994904, "rewards/bleu_reward_func/std": 0.09912555664777756, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 242.46875, "completions/mean_terminated_length": 180.2692413330078, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.208, "grad_norm": 4.740394592285156, "kl": 0.08489990234375, "learning_rate": 1e-06, "loss": 0.0594, "num_tokens": 3432127.0, "reward": 0.05275239422917366, "reward_std": 0.050225820392370224, "rewards/bleu_reward_func/mean": 0.05275239422917366, "rewards/bleu_reward_func/std": 0.07898835092782974, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 331.84375, "completions/mean_terminated_length": 223.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.2088, "grad_norm": 3.1740782260894775, "kl": 0.05401611328125, "learning_rate": 1e-06, "loss": -0.0923, "num_tokens": 3446746.0, "reward": 0.12386887520551682, "reward_std": 0.031204696744680405, "rewards/bleu_reward_func/mean": 0.12386887520551682, "rewards/bleu_reward_func/std": 0.1644604653120041, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 268.66668701171875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.2096, "grad_norm": 2.937896728515625, "kl": 0.033477783203125, "learning_rate": 1e-06, "loss": -0.0197, "num_tokens": 3460890.0, "reward": 0.05950773134827614, "reward_std": 0.017293047159910202, "rewards/bleu_reward_func/mean": 0.05950773134827614, "rewards/bleu_reward_func/std": 0.04094443470239639, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 135.40625, "completions/mean_terminated_length": 135.40625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2104, "grad_norm": 8.865147590637207, "kl": 0.2249755859375, "learning_rate": 1e-06, "loss": -0.0363, "num_tokens": 3475103.0, "reward": 0.20508863031864166, "reward_std": 0.040958937257528305, "rewards/bleu_reward_func/mean": 0.20508863031864166, "rewards/bleu_reward_func/std": 0.14616157114505768, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 267.8125, "completions/mean_terminated_length": 186.4166717529297, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.2112, "grad_norm": 9.684611320495605, "kl": 0.241973876953125, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 3487049.0, "reward": 0.098166324198246, "reward_std": 0.040819209069013596, "rewards/bleu_reward_func/mean": 0.098166324198246, "rewards/bleu_reward_func/std": 0.08471043407917023, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 129.3333282470703, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.212, "grad_norm": 10.798442840576172, "kl": 0.1309814453125, "learning_rate": 1e-06, "loss": 0.3087, "num_tokens": 3501029.0, "reward": 0.12524467706680298, "reward_std": 0.05395754426717758, "rewards/bleu_reward_func/mean": 0.12524467706680298, "rewards/bleu_reward_func/std": 0.1178852915763855, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 205.71875, "completions/mean_terminated_length": 103.625, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2128, "grad_norm": 6.346302032470703, "kl": 0.132049560546875, "learning_rate": 1e-06, "loss": 0.0884, "num_tokens": 3511372.0, "reward": 0.10632273554801941, "reward_std": 0.041688427329063416, "rewards/bleu_reward_func/mean": 0.10632273554801941, "rewards/bleu_reward_func/std": 0.09963962435722351, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 231.4375, "completions/mean_terminated_length": 137.9166717529297, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2136, "grad_norm": 9.553611755371094, "kl": 0.2503204345703125, "learning_rate": 1e-06, "loss": -0.085, "num_tokens": 3524570.0, "reward": 0.08381873369216919, "reward_std": 0.026928268373012543, "rewards/bleu_reward_func/mean": 0.08381873369216919, "rewards/bleu_reward_func/std": 0.06075910106301308, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 266.8125, "completions/mean_terminated_length": 198.1599884033203, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.2144, "grad_norm": 5.0754289627075195, "kl": 0.09002685546875, "learning_rate": 1e-06, "loss": -0.0985, "num_tokens": 3535156.0, "reward": 0.04936995357275009, "reward_std": 0.02683193050324917, "rewards/bleu_reward_func/mean": 0.04936995357275009, "rewards/bleu_reward_func/std": 0.05894342064857483, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 432.5, "completions/mean_terminated_length": 300.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.2152, "grad_norm": 2.118546724319458, "kl": 0.0207061767578125, "learning_rate": 1e-06, "loss": -0.0481, "num_tokens": 3555804.0, "reward": 0.05241474509239197, "reward_std": 0.019338509067893028, "rewards/bleu_reward_func/mean": 0.05241474509239197, "rewards/bleu_reward_func/std": 0.06824250519275665, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 192.83334350585938, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.216, "grad_norm": 3.938976526260376, "kl": 0.042999267578125, "learning_rate": 1e-06, "loss": -0.1052, "num_tokens": 3572328.0, "reward": 0.21750634908676147, "reward_std": 0.06779822707176208, "rewards/bleu_reward_func/mean": 0.21750634908676147, "rewards/bleu_reward_func/std": 0.28914642333984375, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 293.3125, "completions/mean_terminated_length": 178.76190185546875, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.2168, "grad_norm": 3.788853645324707, "kl": 0.041900634765625, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 3587050.0, "reward": 0.04385410249233246, "reward_std": 0.030311163514852524, "rewards/bleu_reward_func/mean": 0.04385410249233246, "rewards/bleu_reward_func/std": 0.047958169132471085, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 232.25, "completions/mean_terminated_length": 139.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2176, "grad_norm": 12.908583641052246, "kl": 0.464263916015625, "learning_rate": 1e-06, "loss": 0.2404, "num_tokens": 3598874.0, "reward": 0.1504618227481842, "reward_std": 0.04004389047622681, "rewards/bleu_reward_func/mean": 0.1504618227481842, "rewards/bleu_reward_func/std": 0.16537794470787048, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 209.03125, "completions/mean_terminated_length": 199.258056640625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.2184, "grad_norm": 6.985334873199463, "kl": 0.16595458984375, "learning_rate": 1e-06, "loss": -0.0408, "num_tokens": 3610515.0, "reward": 0.21218228340148926, "reward_std": 0.09676108509302139, "rewards/bleu_reward_func/mean": 0.21218228340148926, "rewards/bleu_reward_func/std": 0.22182048857212067, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 333.9375, "completions/mean_terminated_length": 195.44444274902344, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2192, "grad_norm": 3.482099771499634, "kl": 0.03741455078125, "learning_rate": 1e-06, "loss": -0.1819, "num_tokens": 3623921.0, "reward": 0.11982771754264832, "reward_std": 0.063297338783741, "rewards/bleu_reward_func/mean": 0.11982771754264832, "rewards/bleu_reward_func/std": 0.09915972501039505, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 292.90625, "completions/mean_terminated_length": 261.6071472167969, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.22, "grad_norm": 4.1525559425354, "kl": 0.041351318359375, "learning_rate": 1e-06, "loss": -0.0478, "num_tokens": 3634918.0, "reward": 0.05305434763431549, "reward_std": 0.019571729004383087, "rewards/bleu_reward_func/mean": 0.05305434763431549, "rewards/bleu_reward_func/std": 0.04326590150594711, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 229.59375, "completions/mean_terminated_length": 135.45834350585938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2208, "grad_norm": 14.463852882385254, "kl": 0.257415771484375, "learning_rate": 1e-06, "loss": -0.0433, "num_tokens": 3647289.0, "reward": 0.23456689715385437, "reward_std": 0.08336643874645233, "rewards/bleu_reward_func/mean": 0.23456689715385437, "rewards/bleu_reward_func/std": 0.2258531004190445, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 193.375, "completions/mean_terminated_length": 87.16667175292969, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2216, "grad_norm": 21.709369659423828, "kl": 0.1391448974609375, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 3656709.0, "reward": 0.16775630414485931, "reward_std": 0.03647792339324951, "rewards/bleu_reward_func/mean": 0.16775630414485931, "rewards/bleu_reward_func/std": 0.15713484585285187, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 219.78125, "completions/mean_terminated_length": 122.375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2224, "grad_norm": 7.275771141052246, "kl": 0.22491455078125, "learning_rate": 1e-06, "loss": 0.072, "num_tokens": 3670158.0, "reward": 0.1231408566236496, "reward_std": 0.022272268310189247, "rewards/bleu_reward_func/mean": 0.1231408566236496, "rewards/bleu_reward_func/std": 0.1077708899974823, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 339.46875, "completions/mean_terminated_length": 261.04547119140625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2232, "grad_norm": 3.146303176879883, "kl": 0.0390625, "learning_rate": 1e-06, "loss": -0.1333, "num_tokens": 3683925.0, "reward": 0.0675458312034607, "reward_std": 0.017428681254386902, "rewards/bleu_reward_func/mean": 0.0675458312034607, "rewards/bleu_reward_func/std": 0.05334463343024254, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 321.9375, "completions/mean_terminated_length": 268.7200012207031, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.224, "grad_norm": 8.726150512695312, "kl": 0.156707763671875, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 3699747.0, "reward": 0.11248552799224854, "reward_std": 0.03111671656370163, "rewards/bleu_reward_func/mean": 0.11248552799224854, "rewards/bleu_reward_func/std": 0.08908119797706604, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 247.84616088867188, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2248, "grad_norm": 3.081026077270508, "kl": 0.021881103515625, "learning_rate": 1e-06, "loss": -0.1217, "num_tokens": 3712759.0, "reward": 0.09313205629587173, "reward_std": 0.03823218122124672, "rewards/bleu_reward_func/mean": 0.09313205629587173, "rewards/bleu_reward_func/std": 0.06713149696588516, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 316.46875, "completions/mean_terminated_length": 251.2916717529297, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2256, "grad_norm": 3.1275222301483154, "kl": 0.035675048828125, "learning_rate": 1e-06, "loss": -0.1645, "num_tokens": 3725598.0, "reward": 0.032498396933078766, "reward_std": 0.018658628687262535, "rewards/bleu_reward_func/mean": 0.032498396933078766, "rewards/bleu_reward_func/std": 0.019405974075198174, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 194.71875, "completions/mean_terminated_length": 173.56668090820312, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2264, "grad_norm": 4.276275634765625, "kl": 0.08013916015625, "learning_rate": 1e-06, "loss": 0.0746, "num_tokens": 3736861.0, "reward": 0.12694165110588074, "reward_std": 0.04432743415236473, "rewards/bleu_reward_func/mean": 0.12694165110588074, "rewards/bleu_reward_func/std": 0.13188457489013672, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 101.68421173095703, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2272, "grad_norm": 7.712943077087402, "kl": 0.271392822265625, "learning_rate": 1e-06, "loss": -0.0787, "num_tokens": 3751545.0, "reward": 0.1655203104019165, "reward_std": 0.08383054286241531, "rewards/bleu_reward_func/mean": 0.1655203104019165, "rewards/bleu_reward_func/std": 0.1525241732597351, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 342.125, "completions/mean_terminated_length": 225.89474487304688, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.228, "grad_norm": 3.6280434131622314, "kl": 0.046173095703125, "learning_rate": 1e-06, "loss": -0.1072, "num_tokens": 3765781.0, "reward": 0.042814724147319794, "reward_std": 0.026553209871053696, "rewards/bleu_reward_func/mean": 0.042814724147319794, "rewards/bleu_reward_func/std": 0.03911494091153145, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 208.96875, "completions/mean_terminated_length": 199.19354248046875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2288, "grad_norm": 7.985737323760986, "kl": 0.12091064453125, "learning_rate": 1e-06, "loss": -0.1044, "num_tokens": 3779220.0, "reward": 0.11331084370613098, "reward_std": 0.025679122656583786, "rewards/bleu_reward_func/mean": 0.11331084370613098, "rewards/bleu_reward_func/std": 0.16165612637996674, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 351.3125, "completions/mean_terminated_length": 209.5294189453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.2296, "grad_norm": 3.912679433822632, "kl": 0.026214599609375, "learning_rate": 1e-06, "loss": 0.1208, "num_tokens": 3794550.0, "reward": 0.01693039759993553, "reward_std": 0.0203933697193861, "rewards/bleu_reward_func/mean": 0.01693039759993553, "rewards/bleu_reward_func/std": 0.02536601759493351, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 274.65625, "completions/mean_terminated_length": 240.75001525878906, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2304, "grad_norm": 6.236807346343994, "kl": 0.1007537841796875, "learning_rate": 1e-06, "loss": -0.0844, "num_tokens": 3808595.0, "reward": 0.13739125430583954, "reward_std": 0.042728863656520844, "rewards/bleu_reward_func/mean": 0.13739125430583954, "rewards/bleu_reward_func/std": 0.09978168457746506, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 260.15625, "completions/mean_terminated_length": 87.84210968017578, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2312, "grad_norm": 12.087539672851562, "kl": 0.2603912353515625, "learning_rate": 1e-06, "loss": 0.1979, "num_tokens": 3821552.0, "reward": 0.1537414938211441, "reward_std": 0.04864966496825218, "rewards/bleu_reward_func/mean": 0.1537414938211441, "rewards/bleu_reward_func/std": 0.08011970669031143, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 214.40625, "completions/mean_terminated_length": 79.13636779785156, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.232, "grad_norm": 6.511635780334473, "kl": 0.12432861328125, "learning_rate": 1e-06, "loss": 0.2609, "num_tokens": 3838117.0, "reward": 0.19495005905628204, "reward_std": 0.09461250901222229, "rewards/bleu_reward_func/mean": 0.19495005905628204, "rewards/bleu_reward_func/std": 0.20672400295734406, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 322.78125, "completions/mean_terminated_length": 223.6666717529297, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.2328, "grad_norm": 4.590160369873047, "kl": 0.127716064453125, "learning_rate": 1e-06, "loss": -0.1225, "num_tokens": 3853958.0, "reward": 0.1360878348350525, "reward_std": 0.03053300268948078, "rewards/bleu_reward_func/mean": 0.1360878348350525, "rewards/bleu_reward_func/std": 0.17878462374210358, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 462.6875, "completions/mean_terminated_length": 390.6153869628906, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.2336, "grad_norm": 2.3334367275238037, "kl": 0.030426025390625, "learning_rate": 1e-06, "loss": -0.0405, "num_tokens": 3875596.0, "reward": 0.06421424448490143, "reward_std": 0.02072659507393837, "rewards/bleu_reward_func/mean": 0.06421424448490143, "rewards/bleu_reward_func/std": 0.02574257366359234, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 325.03125, "completions/mean_terminated_length": 251.86956787109375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2344, "grad_norm": 3.4702064990997314, "kl": 0.03289794921875, "learning_rate": 1e-06, "loss": 0.1122, "num_tokens": 3892021.0, "reward": 0.04875369742512703, "reward_std": 0.020287783816456795, "rewards/bleu_reward_func/mean": 0.04875369742512703, "rewards/bleu_reward_func/std": 0.0285445898771286, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 107.375, "completions/mean_terminated_length": 94.32257843017578, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2352, "grad_norm": 25.415559768676758, "kl": 0.23876953125, "learning_rate": 1e-06, "loss": 0.1099, "num_tokens": 3903457.0, "reward": 0.12372880429029465, "reward_std": 0.02668173238635063, "rewards/bleu_reward_func/mean": 0.12372880429029465, "rewards/bleu_reward_func/std": 0.12391357123851776, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 244.09375, "completions/mean_terminated_length": 122.31818389892578, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.236, "grad_norm": 8.319884300231934, "kl": 0.14251708984375, "learning_rate": 1e-06, "loss": -0.034, "num_tokens": 3917028.0, "reward": 0.16006486117839813, "reward_std": 0.02584708109498024, "rewards/bleu_reward_func/mean": 0.16006486117839813, "rewards/bleu_reward_func/std": 0.1484500914812088, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 278.875, "completions/mean_terminated_length": 139.0, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.2368, "grad_norm": 4.291149616241455, "kl": 0.131500244140625, "learning_rate": 1e-06, "loss": -0.192, "num_tokens": 3929400.0, "reward": 0.09954051673412323, "reward_std": 0.03838299959897995, "rewards/bleu_reward_func/mean": 0.09954051673412323, "rewards/bleu_reward_func/std": 0.13533763587474823, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 288.9375, "completions/mean_terminated_length": 187.5454559326172, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.2376, "grad_norm": 5.0546417236328125, "kl": 0.10858154296875, "learning_rate": 1e-06, "loss": -0.0687, "num_tokens": 3942222.0, "reward": 0.16907253861427307, "reward_std": 0.03968513384461403, "rewards/bleu_reward_func/mean": 0.16907253861427307, "rewards/bleu_reward_func/std": 0.10800375789403915, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 309.1875, "completions/mean_terminated_length": 151.44444274902344, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2384, "grad_norm": 8.339001655578613, "kl": 0.1490631103515625, "learning_rate": 1e-06, "loss": -0.0121, "num_tokens": 3954316.0, "reward": 0.06681232899427414, "reward_std": 0.015474791638553143, "rewards/bleu_reward_func/mean": 0.06681232899427414, "rewards/bleu_reward_func/std": 0.06617429107427597, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 285.59375, "completions/mean_terminated_length": 109.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2392, "grad_norm": 3.8715662956237793, "kl": 0.050140380859375, "learning_rate": 1e-06, "loss": 0.1729, "num_tokens": 3967087.0, "reward": 0.1066230833530426, "reward_std": 0.08889298141002655, "rewards/bleu_reward_func/mean": 0.1066230833530426, "rewards/bleu_reward_func/std": 0.14223438501358032, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 120.9375, "completions/mean_terminated_length": 120.9375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.24, "grad_norm": 9.271559715270996, "kl": 0.223388671875, "learning_rate": 1e-06, "loss": 0.0531, "num_tokens": 3977797.0, "reward": 0.09239183366298676, "reward_std": 0.04012807458639145, "rewards/bleu_reward_func/mean": 0.09239183366298676, "rewards/bleu_reward_func/std": 0.07950045168399811, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 202.239990234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.2408, "grad_norm": 8.53159236907959, "kl": 0.18048095703125, "learning_rate": 1e-06, "loss": -0.1823, "num_tokens": 3988157.0, "reward": 0.04499006271362305, "reward_std": 0.015048853121697903, "rewards/bleu_reward_func/mean": 0.04499006271362305, "rewards/bleu_reward_func/std": 0.036676883697509766, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 224.0625, "completions/mean_terminated_length": 157.61538696289062, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2416, "grad_norm": 6.0366997718811035, "kl": 0.099029541015625, "learning_rate": 1e-06, "loss": -0.1824, "num_tokens": 4000135.0, "reward": 0.1630059778690338, "reward_std": 0.04720958322286606, "rewards/bleu_reward_func/mean": 0.1630059778690338, "rewards/bleu_reward_func/std": 0.1834760457277298, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 169.3125, "completions/mean_terminated_length": 90.23077392578125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2424, "grad_norm": 9.543852806091309, "kl": 0.35198974609375, "learning_rate": 1e-06, "loss": -0.2399, "num_tokens": 4009009.0, "reward": 0.06052142754197121, "reward_std": 0.026765264570713043, "rewards/bleu_reward_func/mean": 0.06052142754197121, "rewards/bleu_reward_func/std": 0.052253786474466324, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 365.78125, "completions/mean_terminated_length": 265.7368469238281, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.2432, "grad_norm": 3.007157564163208, "kl": 0.0393524169921875, "learning_rate": 1e-06, "loss": 0.032, "num_tokens": 4023690.0, "reward": 0.025675857439637184, "reward_std": 0.013720525428652763, "rewards/bleu_reward_func/mean": 0.025675857439637184, "rewards/bleu_reward_func/std": 0.022033939138054848, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 158.6875, "completions/mean_terminated_length": 158.6875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.244, "grad_norm": 7.10622501373291, "kl": 0.21661376953125, "learning_rate": 1e-06, "loss": 0.166, "num_tokens": 4033848.0, "reward": 0.19492439925670624, "reward_std": 0.0628402829170227, "rewards/bleu_reward_func/mean": 0.19492439925670624, "rewards/bleu_reward_func/std": 0.22491495311260223, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 345.53125, "completions/mean_terminated_length": 290.04168701171875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.2448, "grad_norm": 4.572328090667725, "kl": 0.099700927734375, "learning_rate": 1e-06, "loss": 0.1016, "num_tokens": 4047897.0, "reward": 0.12647973001003265, "reward_std": 0.03362637385725975, "rewards/bleu_reward_func/mean": 0.12647973001003265, "rewards/bleu_reward_func/std": 0.08024211972951889, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 217.6875, "completions/mean_terminated_length": 175.6428680419922, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2456, "grad_norm": 7.489211082458496, "kl": 0.17584228515625, "learning_rate": 1e-06, "loss": -0.1361, "num_tokens": 4062471.0, "reward": 0.15859398245811462, "reward_std": 0.059820279479026794, "rewards/bleu_reward_func/mean": 0.15859398245811462, "rewards/bleu_reward_func/std": 0.11927466094493866, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 304.0625, "completions/mean_terminated_length": 209.5454559326172, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2464, "grad_norm": 6.605251789093018, "kl": 0.15716552734375, "learning_rate": 1e-06, "loss": 0.1835, "num_tokens": 4079553.0, "reward": 0.048189468681812286, "reward_std": 0.01783904619514942, "rewards/bleu_reward_func/mean": 0.048189468681812286, "rewards/bleu_reward_func/std": 0.037260618060827255, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 150.46875, "completions/mean_terminated_length": 67.03846740722656, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2472, "grad_norm": 20.150175094604492, "kl": 0.31695556640625, "learning_rate": 1e-06, "loss": -0.2639, "num_tokens": 4089536.0, "reward": 0.19017143547534943, "reward_std": 0.06138678267598152, "rewards/bleu_reward_func/mean": 0.19017143547534943, "rewards/bleu_reward_func/std": 0.25128865242004395, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 171.3125, "completions/mean_terminated_length": 136.0689697265625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.248, "grad_norm": 6.626379013061523, "kl": 0.103729248046875, "learning_rate": 1e-06, "loss": -0.1912, "num_tokens": 4100146.0, "reward": 0.08903198689222336, "reward_std": 0.029232412576675415, "rewards/bleu_reward_func/mean": 0.08903198689222336, "rewards/bleu_reward_func/std": 0.09126507490873337, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 216.46875, "completions/mean_terminated_length": 117.95833587646484, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2488, "grad_norm": 5.2524285316467285, "kl": 0.0589141845703125, "learning_rate": 1e-06, "loss": 0.3318, "num_tokens": 4112841.0, "reward": 0.07349678874015808, "reward_std": 0.05337782949209213, "rewards/bleu_reward_func/mean": 0.07349678874015808, "rewards/bleu_reward_func/std": 0.10531707108020782, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 196.17391967773438, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.2496, "grad_norm": 5.209020137786865, "kl": 0.11212158203125, "learning_rate": 1e-06, "loss": -0.1362, "num_tokens": 4125369.0, "reward": 0.1321243941783905, "reward_std": 0.035379908978939056, "rewards/bleu_reward_func/mean": 0.1321243941783905, "rewards/bleu_reward_func/std": 0.12779219448566437, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 397.1875, "completions/mean_terminated_length": 205.83334350585938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2504, "grad_norm": 2.491729974746704, "kl": 0.029266357421875, "learning_rate": 1e-06, "loss": -0.0819, "num_tokens": 4142671.0, "reward": 0.021221335977315903, "reward_std": 0.008927191607654095, "rewards/bleu_reward_func/mean": 0.021221335977315903, "rewards/bleu_reward_func/std": 0.01940017379820347, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 276.8125, "completions/mean_terminated_length": 222.53846740722656, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.2512, "grad_norm": 3.1302947998046875, "kl": 0.030731201171875, "learning_rate": 1e-06, "loss": -0.09, "num_tokens": 4158681.0, "reward": 0.18806447088718414, "reward_std": 0.04276939481496811, "rewards/bleu_reward_func/mean": 0.18806447088718414, "rewards/bleu_reward_func/std": 0.2711097002029419, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 202.71875, "completions/mean_terminated_length": 182.10000610351562, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.252, "grad_norm": 9.11577320098877, "kl": 0.321502685546875, "learning_rate": 1e-06, "loss": 0.2469, "num_tokens": 4168304.0, "reward": 0.17324072122573853, "reward_std": 0.07514998316764832, "rewards/bleu_reward_func/mean": 0.17324072122573853, "rewards/bleu_reward_func/std": 0.15059800446033478, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 310.65625, "completions/mean_terminated_length": 133.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2528, "grad_norm": 4.476902961730957, "kl": 0.22100830078125, "learning_rate": 1e-06, "loss": -0.0695, "num_tokens": 4183237.0, "reward": 0.11044389009475708, "reward_std": 0.04662460461258888, "rewards/bleu_reward_func/mean": 0.11044389009475708, "rewards/bleu_reward_func/std": 0.13189704716205597, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 326.40625, "completions/mean_terminated_length": 264.54168701171875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2536, "grad_norm": 4.724470138549805, "kl": 0.039764404296875, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 4196786.0, "reward": 0.1738719940185547, "reward_std": 0.06735121458768845, "rewards/bleu_reward_func/mean": 0.1738719940185547, "rewards/bleu_reward_func/std": 0.15234871208667755, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 264.65625, "completions/mean_terminated_length": 116.25, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2544, "grad_norm": 7.755268096923828, "kl": 0.23388671875, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 4211319.0, "reward": 0.16174694895744324, "reward_std": 0.04472574219107628, "rewards/bleu_reward_func/mean": 0.16174694895744324, "rewards/bleu_reward_func/std": 0.13533204793930054, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 326.0625, "completions/mean_terminated_length": 228.6666717529297, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.2552, "grad_norm": 3.5100746154785156, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": -0.0174, "num_tokens": 4224641.0, "reward": 0.14605101943016052, "reward_std": 0.039064351469278336, "rewards/bleu_reward_func/mean": 0.14605101943016052, "rewards/bleu_reward_func/std": 0.1437525898218155, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 387.5, "completions/mean_terminated_length": 180.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.256, "grad_norm": 3.499901056289673, "kl": 0.03240966796875, "learning_rate": 1e-06, "loss": -0.101, "num_tokens": 4244041.0, "reward": 0.038129642605781555, "reward_std": 0.0157744400203228, "rewards/bleu_reward_func/mean": 0.038129642605781555, "rewards/bleu_reward_func/std": 0.030961766839027405, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 207.1724090576172, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2568, "grad_norm": 6.800954341888428, "kl": 0.172210693359375, "learning_rate": 1e-06, "loss": -0.2682, "num_tokens": 4257425.0, "reward": 0.08078090846538544, "reward_std": 0.0318281352519989, "rewards/bleu_reward_func/mean": 0.08078090846538544, "rewards/bleu_reward_func/std": 0.060885149985551834, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 34.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2576, "grad_norm": 6.995741367340088, "kl": 0.197662353515625, "learning_rate": 1e-06, "loss": 0.0321, "num_tokens": 4270729.0, "reward": 0.3046156167984009, "reward_std": 0.045112840831279755, "rewards/bleu_reward_func/mean": 0.3046156167984009, "rewards/bleu_reward_func/std": 0.17106564342975616, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 309.0625, "completions/mean_terminated_length": 216.8181915283203, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2584, "grad_norm": 8.159075736999512, "kl": 0.11962890625, "learning_rate": 1e-06, "loss": 0.0599, "num_tokens": 4286907.0, "reward": 0.11749087274074554, "reward_std": 0.04918123036623001, "rewards/bleu_reward_func/mean": 0.11749087274074554, "rewards/bleu_reward_func/std": 0.12518151104450226, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 156.90625, "completions/mean_terminated_length": 156.90625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2592, "grad_norm": 7.079853057861328, "kl": 0.09991455078125, "learning_rate": 1e-06, "loss": 0.0397, "num_tokens": 4295536.0, "reward": 0.11096417158842087, "reward_std": 0.04051455110311508, "rewards/bleu_reward_func/mean": 0.11096417158842087, "rewards/bleu_reward_func/std": 0.1420901119709015, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 49.7599983215332, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.26, "grad_norm": 8.065258026123047, "kl": 0.167816162109375, "learning_rate": 1e-06, "loss": -0.0243, "num_tokens": 4306404.0, "reward": 0.13756218552589417, "reward_std": 0.02154640108346939, "rewards/bleu_reward_func/mean": 0.13756218552589417, "rewards/bleu_reward_func/std": 0.14523112773895264, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 385.75, "completions/mean_terminated_length": 310.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.2608, "grad_norm": 2.441365957260132, "kl": 0.019775390625, "learning_rate": 1e-06, "loss": 0.024, "num_tokens": 4323836.0, "reward": 0.023768192157149315, "reward_std": 0.009069718420505524, "rewards/bleu_reward_func/mean": 0.023768192157149315, "rewards/bleu_reward_func/std": 0.029040560126304626, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 319.46875, "completions/mean_terminated_length": 203.9499969482422, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.2616, "grad_norm": 5.7556071281433105, "kl": 0.091705322265625, "learning_rate": 1e-06, "loss": 0.0042, "num_tokens": 4338667.0, "reward": 0.07871399819850922, "reward_std": 0.03653344139456749, "rewards/bleu_reward_func/mean": 0.07871399819850922, "rewards/bleu_reward_func/std": 0.06572794169187546, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 264.84375, "completions/mean_terminated_length": 152.5, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2624, "grad_norm": 6.231250286102295, "kl": 0.1138916015625, "learning_rate": 1e-06, "loss": -0.0458, "num_tokens": 4351270.0, "reward": 0.16190959513187408, "reward_std": 0.02650507725775242, "rewards/bleu_reward_func/mean": 0.16190959513187408, "rewards/bleu_reward_func/std": 0.15018552541732788, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 350.625, "completions/mean_terminated_length": 277.2727355957031, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.2632, "grad_norm": 2.828697681427002, "kl": 0.02972412109375, "learning_rate": 1e-06, "loss": 0.0748, "num_tokens": 4366018.0, "reward": 0.07461819052696228, "reward_std": 0.034676797688007355, "rewards/bleu_reward_func/mean": 0.07461819052696228, "rewards/bleu_reward_func/std": 0.10171358287334442, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 330.96875, "completions/mean_terminated_length": 171.23529052734375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.264, "grad_norm": 7.326402187347412, "kl": 0.0977630615234375, "learning_rate": 1e-06, "loss": 0.2768, "num_tokens": 4378353.0, "reward": 0.07485680282115936, "reward_std": 0.04837151616811752, "rewards/bleu_reward_func/mean": 0.07485680282115936, "rewards/bleu_reward_func/std": 0.04874453693628311, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 65.40625, "completions/mean_terminated_length": 65.40625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2648, "grad_norm": 12.08074951171875, "kl": 0.335693359375, "learning_rate": 1e-06, "loss": 0.1573, "num_tokens": 4384062.0, "reward": 0.19588544964790344, "reward_std": 0.09824244678020477, "rewards/bleu_reward_func/mean": 0.19588544964790344, "rewards/bleu_reward_func/std": 0.16972649097442627, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 203.0800018310547, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.2656, "grad_norm": 4.561427593231201, "kl": 0.039154052734375, "learning_rate": 1e-06, "loss": -0.0917, "num_tokens": 4394427.0, "reward": 0.06531640887260437, "reward_std": 0.018873782828450203, "rewards/bleu_reward_func/mean": 0.06531640887260437, "rewards/bleu_reward_func/std": 0.059104837477207184, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 155.71429443359375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2664, "grad_norm": 6.5239057540893555, "kl": 0.169952392578125, "learning_rate": 1e-06, "loss": 0.0545, "num_tokens": 4409739.0, "reward": 0.23698079586029053, "reward_std": 0.08829502761363983, "rewards/bleu_reward_func/mean": 0.23698079586029053, "rewards/bleu_reward_func/std": 0.2539888322353363, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 103.93103790283203, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.2672, "grad_norm": 6.988838195800781, "kl": 0.192413330078125, "learning_rate": 1e-06, "loss": 0.0242, "num_tokens": 4420033.0, "reward": 0.18931233882904053, "reward_std": 0.06329823285341263, "rewards/bleu_reward_func/mean": 0.18931233882904053, "rewards/bleu_reward_func/std": 0.16267651319503784, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 149.71875, "completions/mean_terminated_length": 66.11538696289062, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.268, "grad_norm": 16.95305061340332, "kl": 0.3260498046875, "learning_rate": 1e-06, "loss": 0.3727, "num_tokens": 4429376.0, "reward": 0.11154920607805252, "reward_std": 0.06479852646589279, "rewards/bleu_reward_func/mean": 0.11154920607805252, "rewards/bleu_reward_func/std": 0.07707681506872177, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 185.51724243164062, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2688, "grad_norm": 12.891951560974121, "kl": 0.154815673828125, "learning_rate": 1e-06, "loss": 0.1506, "num_tokens": 4438340.0, "reward": 0.11881305277347565, "reward_std": 0.04300341382622719, "rewards/bleu_reward_func/mean": 0.11881305277347565, "rewards/bleu_reward_func/std": 0.11628168076276779, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 127.04762268066406, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2696, "grad_norm": 8.3147554397583, "kl": 0.0980682373046875, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 4451328.0, "reward": 0.11791149526834488, "reward_std": 0.02945806086063385, "rewards/bleu_reward_func/mean": 0.11791149526834488, "rewards/bleu_reward_func/std": 0.06387177854776382, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 159.28125, "completions/mean_terminated_length": 60.52000045776367, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.2704, "grad_norm": 6.874416828155518, "kl": 0.235107421875, "learning_rate": 1e-06, "loss": 0.1172, "num_tokens": 4461785.0, "reward": 0.18331755697727203, "reward_std": 0.05733542889356613, "rewards/bleu_reward_func/mean": 0.18331755697727203, "rewards/bleu_reward_func/std": 0.17218343913555145, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 206.1875, "completions/mean_terminated_length": 120.55999755859375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2712, "grad_norm": 7.444963455200195, "kl": 0.08843994140625, "learning_rate": 1e-06, "loss": 0.3417, "num_tokens": 4471031.0, "reward": 0.08221863210201263, "reward_std": 0.030037853866815567, "rewards/bleu_reward_func/mean": 0.08221863210201263, "rewards/bleu_reward_func/std": 0.05527469143271446, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 256.84375, "completions/mean_terminated_length": 197.9615478515625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.272, "grad_norm": 6855.86328125, "kl": 1.03955078125, "learning_rate": 1e-06, "loss": 0.0411, "num_tokens": 4485834.0, "reward": 0.13405509293079376, "reward_std": 0.03707335144281387, "rewards/bleu_reward_func/mean": 0.13405509293079376, "rewards/bleu_reward_func/std": 0.15687085688114166, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 141.53846740722656, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2728, "grad_norm": 8.717283248901367, "kl": 0.128326416015625, "learning_rate": 1e-06, "loss": -0.0381, "num_tokens": 4496202.0, "reward": 0.0755915641784668, "reward_std": 0.029588045552372932, "rewards/bleu_reward_func/mean": 0.0755915641784668, "rewards/bleu_reward_func/std": 0.05914263799786568, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 213.15625, "completions/mean_terminated_length": 129.47999572753906, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2736, "grad_norm": 9.269394874572754, "kl": 0.21234130859375, "learning_rate": 1e-06, "loss": -0.0787, "num_tokens": 4505447.0, "reward": 0.11310072988271713, "reward_std": 0.035067904740571976, "rewards/bleu_reward_func/mean": 0.11310072988271713, "rewards/bleu_reward_func/std": 0.10819036513566971, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 353.90625, "completions/mean_terminated_length": 174.73333740234375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2744, "grad_norm": 6.147165298461914, "kl": 0.0384521484375, "learning_rate": 1e-06, "loss": -0.0413, "num_tokens": 4519052.0, "reward": 0.06785966455936432, "reward_std": 0.039666250348091125, "rewards/bleu_reward_func/mean": 0.06785966455936432, "rewards/bleu_reward_func/std": 0.059012189507484436, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 370.84375, "completions/mean_terminated_length": 261.0555725097656, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2752, "grad_norm": 6.257096767425537, "kl": 0.170440673828125, "learning_rate": 1e-06, "loss": 0.0397, "num_tokens": 4533975.0, "reward": 0.05020497739315033, "reward_std": 0.009127253666520119, "rewards/bleu_reward_func/mean": 0.05020497739315033, "rewards/bleu_reward_func/std": 0.04745229333639145, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 260.1875, "completions/mean_terminated_length": 64.33333587646484, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.276, "grad_norm": 8.694131851196289, "kl": 0.40167236328125, "learning_rate": 1e-06, "loss": 0.124, "num_tokens": 4548765.0, "reward": 0.17815490067005157, "reward_std": 0.04761611297726631, "rewards/bleu_reward_func/mean": 0.17815490067005157, "rewards/bleu_reward_func/std": 0.22018791735172272, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 334.78125, "completions/mean_terminated_length": 275.7083435058594, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2768, "grad_norm": 6.1226325035095215, "kl": 0.105133056640625, "learning_rate": 1e-06, "loss": -0.0251, "num_tokens": 4565158.0, "reward": 0.09645688533782959, "reward_std": 0.0746307447552681, "rewards/bleu_reward_func/mean": 0.09645688533782959, "rewards/bleu_reward_func/std": 0.1715475171804428, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 98.46875, "completions/mean_terminated_length": 98.46875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.2776, "grad_norm": 8.647904396057129, "kl": 0.328857421875, "learning_rate": 1e-06, "loss": -0.0264, "num_tokens": 4576637.0, "reward": 0.3595752716064453, "reward_std": 0.09626303613185883, "rewards/bleu_reward_func/mean": 0.3595752716064453, "rewards/bleu_reward_func/std": 0.293544203042984, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 191.28125, "completions/mean_terminated_length": 84.375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2784, "grad_norm": 7.7827630043029785, "kl": 0.24041748046875, "learning_rate": 1e-06, "loss": 0.0475, "num_tokens": 4586230.0, "reward": 0.2051679939031601, "reward_std": 0.029646433889865875, "rewards/bleu_reward_func/mean": 0.2051679939031601, "rewards/bleu_reward_func/std": 0.20678655803203583, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 256.66668701171875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.2792, "grad_norm": 8.593353271484375, "kl": 0.149017333984375, "learning_rate": 1e-06, "loss": -0.0651, "num_tokens": 4603070.0, "reward": 0.1438911259174347, "reward_std": 0.06431536376476288, "rewards/bleu_reward_func/mean": 0.1438911259174347, "rewards/bleu_reward_func/std": 0.22814705967903137, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 381.96875, "completions/mean_terminated_length": 191.92308044433594, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.28, "grad_norm": 2.2874648571014404, "kl": 0.023284912109375, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 4619757.0, "reward": 0.19660863280296326, "reward_std": 0.08571420609951019, "rewards/bleu_reward_func/mean": 0.19660863280296326, "rewards/bleu_reward_func/std": 0.2662343680858612, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 380.53125, "completions/mean_terminated_length": 290.5789489746094, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2808, "grad_norm": 2.8600640296936035, "kl": 0.02679443359375, "learning_rate": 1e-06, "loss": 0.082, "num_tokens": 4636806.0, "reward": 0.05401962995529175, "reward_std": 0.019372381269931793, "rewards/bleu_reward_func/mean": 0.05401962995529175, "rewards/bleu_reward_func/std": 0.026677841320633888, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 328.125, "completions/mean_terminated_length": 244.5454559326172, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2816, "grad_norm": 6.117258548736572, "kl": 0.10986328125, "learning_rate": 1e-06, "loss": 0.162, "num_tokens": 4649338.0, "reward": 0.12430500984191895, "reward_std": 0.046015314757823944, "rewards/bleu_reward_func/mean": 0.12430500984191895, "rewards/bleu_reward_func/std": 0.11290674656629562, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 335.15625, "completions/mean_terminated_length": 179.11764526367188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.2824, "grad_norm": 9.883430480957031, "kl": 0.134429931640625, "learning_rate": 1e-06, "loss": -0.0897, "num_tokens": 4664823.0, "reward": 0.10318648815155029, "reward_std": 0.040948014706373215, "rewards/bleu_reward_func/mean": 0.10318648815155029, "rewards/bleu_reward_func/std": 0.098084457218647, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 213.96875, "completions/mean_terminated_length": 204.35482788085938, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.2832, "grad_norm": 3.7569406032562256, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.276, "num_tokens": 4673198.0, "reward": 0.02880779653787613, "reward_std": 0.02136135660111904, "rewards/bleu_reward_func/mean": 0.02880779653787613, "rewards/bleu_reward_func/std": 0.031262028962373734, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 382.34375, "completions/mean_terminated_length": 134.8181915283203, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.284, "grad_norm": 5.402606010437012, "kl": 0.059539794921875, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 4690033.0, "reward": 0.11326944082975388, "reward_std": 0.04008851572871208, "rewards/bleu_reward_func/mean": 0.11326944082975388, "rewards/bleu_reward_func/std": 0.1632446050643921, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.2848, "grad_norm": 15.920856475830078, "kl": 0.2000732421875, "learning_rate": 1e-06, "loss": 0.1343, "num_tokens": 4696953.0, "reward": 0.1625998467206955, "reward_std": 0.10141640901565552, "rewards/bleu_reward_func/mean": 0.1625998467206955, "rewards/bleu_reward_func/std": 0.12067051976919174, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 103.71875, "completions/mean_terminated_length": 103.71875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2856, "grad_norm": 32.86006546020508, "kl": 0.153564453125, "learning_rate": 1e-06, "loss": 0.1497, "num_tokens": 4705000.0, "reward": 0.05853947252035141, "reward_std": 0.014492938295006752, "rewards/bleu_reward_func/mean": 0.05853947252035141, "rewards/bleu_reward_func/std": 0.02192818373441696, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 110.96875, "completions/mean_terminated_length": 110.96875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.2864, "grad_norm": 8.785351753234863, "kl": 0.1767578125, "learning_rate": 1e-06, "loss": 0.0291, "num_tokens": 4713815.0, "reward": 0.256367951631546, "reward_std": 0.06547890603542328, "rewards/bleu_reward_func/mean": 0.256367951631546, "rewards/bleu_reward_func/std": 0.2225809097290039, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 333.21875, "completions/mean_terminated_length": 175.47059631347656, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.2872, "grad_norm": 3.714874744415283, "kl": 0.0813140869140625, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 4732606.0, "reward": 0.08705547451972961, "reward_std": 0.02976841665804386, "rewards/bleu_reward_func/mean": 0.08705547451972961, "rewards/bleu_reward_func/std": 0.041370097547769547, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 147.61289978027344, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.288, "grad_norm": 7.568475723266602, "kl": 0.069976806640625, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 4742086.0, "reward": 0.05895683914422989, "reward_std": 0.036796510219573975, "rewards/bleu_reward_func/mean": 0.05895683914422989, "rewards/bleu_reward_func/std": 0.06153297796845436, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 303.34375, "completions/mean_terminated_length": 221.69566345214844, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.2888, "grad_norm": 3.495642900466919, "kl": 0.033843994140625, "learning_rate": 1e-06, "loss": 0.1195, "num_tokens": 4755153.0, "reward": 0.024642691016197205, "reward_std": 0.00707631791010499, "rewards/bleu_reward_func/mean": 0.024642691016197205, "rewards/bleu_reward_func/std": 0.01350654847919941, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 349.59375, "completions/mean_terminated_length": 238.4736785888672, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.2896, "grad_norm": 3.2497663497924805, "kl": 0.032928466796875, "learning_rate": 1e-06, "loss": 0.0909, "num_tokens": 4768724.0, "reward": 0.06024404242634773, "reward_std": 0.029051221907138824, "rewards/bleu_reward_func/mean": 0.06024404242634773, "rewards/bleu_reward_func/std": 0.05113474279642105, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 308.21875, "completions/mean_terminated_length": 201.4761962890625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.2904, "grad_norm": 3.932180643081665, "kl": 0.0535736083984375, "learning_rate": 1e-06, "loss": -0.1216, "num_tokens": 4784611.0, "reward": 0.10957776010036469, "reward_std": 0.018995165824890137, "rewards/bleu_reward_func/mean": 0.10957776010036469, "rewards/bleu_reward_func/std": 0.12744034826755524, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 315.15625, "completions/mean_terminated_length": 260.0400085449219, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2912, "grad_norm": 3.873363971710205, "kl": 0.04693603515625, "learning_rate": 1e-06, "loss": 0.1157, "num_tokens": 4798600.0, "reward": 0.06850136816501617, "reward_std": 0.03206296265125275, "rewards/bleu_reward_func/mean": 0.06850136816501617, "rewards/bleu_reward_func/std": 0.06299194693565369, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 301.28125, "completions/mean_terminated_length": 205.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.292, "grad_norm": 3.491849184036255, "kl": 0.050079345703125, "learning_rate": 1e-06, "loss": 0.1634, "num_tokens": 4812193.0, "reward": 0.0632539913058281, "reward_std": 0.04620906710624695, "rewards/bleu_reward_func/mean": 0.0632539913058281, "rewards/bleu_reward_func/std": 0.08490858227014542, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 320.40625, "completions/mean_terminated_length": 266.7599792480469, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.2928, "grad_norm": 10.243452072143555, "kl": 0.1219482421875, "learning_rate": 1e-06, "loss": 0.0432, "num_tokens": 4824134.0, "reward": 0.0788659006357193, "reward_std": 0.019495027139782906, "rewards/bleu_reward_func/mean": 0.0788659006357193, "rewards/bleu_reward_func/std": 0.05461956560611725, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 357.40625, "completions/mean_terminated_length": 296.9130554199219, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2936, "grad_norm": 2.715989351272583, "kl": 0.036712646484375, "learning_rate": 1e-06, "loss": -0.1141, "num_tokens": 4839219.0, "reward": 0.1387082040309906, "reward_std": 0.025043122470378876, "rewards/bleu_reward_func/mean": 0.1387082040309906, "rewards/bleu_reward_func/std": 0.14657536149024963, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 418.0, "completions/mean_terminated_length": 261.3333435058594, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.2944, "grad_norm": 2.414018154144287, "kl": 0.029937744140625, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 4857699.0, "reward": 0.06751300394535065, "reward_std": 0.05967854708433151, "rewards/bleu_reward_func/mean": 0.06751300394535065, "rewards/bleu_reward_func/std": 0.08448994904756546, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 50.09375, "completions/mean_terminated_length": 50.09375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.2952, "grad_norm": 12.07331657409668, "kl": 0.331298828125, "learning_rate": 1e-06, "loss": -0.167, "num_tokens": 4865766.0, "reward": 0.2235146164894104, "reward_std": 0.06765347719192505, "rewards/bleu_reward_func/mean": 0.2235146164894104, "rewards/bleu_reward_func/std": 0.15006797015666962, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 155.15789794921875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.296, "grad_norm": 6.003938674926758, "kl": 0.062225341796875, "learning_rate": 1e-06, "loss": 0.0295, "num_tokens": 4883786.0, "reward": 0.09686341136693954, "reward_std": 0.04255010187625885, "rewards/bleu_reward_func/mean": 0.09686341136693954, "rewards/bleu_reward_func/std": 0.11752825975418091, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 381.84375, "completions/mean_terminated_length": 338.4583435058594, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.2968, "grad_norm": 2.782743215560913, "kl": 0.041107177734375, "learning_rate": 1e-06, "loss": -0.0931, "num_tokens": 4898397.0, "reward": 0.06518180668354034, "reward_std": 0.017261603847146034, "rewards/bleu_reward_func/mean": 0.06518180668354034, "rewards/bleu_reward_func/std": 0.07592527568340302, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 191.71875, "completions/mean_terminated_length": 170.36666870117188, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.2976, "grad_norm": 6.834630489349365, "kl": 0.155029296875, "learning_rate": 1e-06, "loss": -0.0674, "num_tokens": 4907564.0, "reward": 0.0751166045665741, "reward_std": 0.03539106994867325, "rewards/bleu_reward_func/mean": 0.0751166045665741, "rewards/bleu_reward_func/std": 0.03759034350514412, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 167.65625, "completions/mean_terminated_length": 156.5483856201172, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.2984, "grad_norm": 9.550743103027344, "kl": 0.1636962890625, "learning_rate": 1e-06, "loss": 0.1533, "num_tokens": 4916969.0, "reward": 0.12691722810268402, "reward_std": 0.019398069009184837, "rewards/bleu_reward_func/mean": 0.12691722810268402, "rewards/bleu_reward_func/std": 0.14723701775074005, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 267.6875, "completions/mean_terminated_length": 199.27999877929688, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.2992, "grad_norm": 5.277988433837891, "kl": 0.143890380859375, "learning_rate": 1e-06, "loss": -0.2024, "num_tokens": 4930367.0, "reward": 0.21388903260231018, "reward_std": 0.0590648353099823, "rewards/bleu_reward_func/mean": 0.21388903260231018, "rewards/bleu_reward_func/std": 0.2627076506614685, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 337.59375, "completions/mean_terminated_length": 246.23809814453125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3, "grad_norm": 10.797468185424805, "kl": 0.12530517578125, "learning_rate": 1e-06, "loss": -0.06, "num_tokens": 4948002.0, "reward": 0.1380675733089447, "reward_std": 0.049179110676050186, "rewards/bleu_reward_func/mean": 0.1380675733089447, "rewards/bleu_reward_func/std": 0.14962899684906006, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 163.1666717529297, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3008, "grad_norm": 6.259679794311523, "kl": 0.1361083984375, "learning_rate": 1e-06, "loss": -0.0324, "num_tokens": 4963942.0, "reward": 0.2779002785682678, "reward_std": 0.049215167760849, "rewards/bleu_reward_func/mean": 0.2779002785682678, "rewards/bleu_reward_func/std": 0.247111514210701, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 228.96875, "completions/mean_terminated_length": 188.5357208251953, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3016, "grad_norm": 9.751809120178223, "kl": 0.12164306640625, "learning_rate": 1e-06, "loss": -0.0784, "num_tokens": 4974437.0, "reward": 0.12611877918243408, "reward_std": 0.05333450064063072, "rewards/bleu_reward_func/mean": 0.12611877918243408, "rewards/bleu_reward_func/std": 0.11847065389156342, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 428.34375, "completions/mean_terminated_length": 306.0769348144531, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3024, "grad_norm": 1.8198633193969727, "kl": 0.025543212890625, "learning_rate": 1e-06, "loss": 0.0668, "num_tokens": 4993880.0, "reward": 0.07207944989204407, "reward_std": 0.019526129588484764, "rewards/bleu_reward_func/mean": 0.07207944989204407, "rewards/bleu_reward_func/std": 0.06778865307569504, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 63.40625, "completions/mean_terminated_length": 63.40625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3032, "grad_norm": 6.933629512786865, "kl": 0.13250732421875, "learning_rate": 1e-06, "loss": 0.3354, "num_tokens": 5001909.0, "reward": 0.12609761953353882, "reward_std": 0.07611958682537079, "rewards/bleu_reward_func/mean": 0.12609761953353882, "rewards/bleu_reward_func/std": 0.09586605429649353, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 148.44444274902344, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.304, "grad_norm": 8.042766571044922, "kl": 0.155029296875, "learning_rate": 1e-06, "loss": 0.1587, "num_tokens": 5012533.0, "reward": 0.18430504202842712, "reward_std": 0.09831003099679947, "rewards/bleu_reward_func/mean": 0.18430504202842712, "rewards/bleu_reward_func/std": 0.1858755648136139, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 333.46875, "completions/mean_terminated_length": 211.3157958984375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3048, "grad_norm": 19.682376861572266, "kl": 0.1099853515625, "learning_rate": 1e-06, "loss": 0.015, "num_tokens": 5028972.0, "reward": 0.09155917167663574, "reward_std": 0.012800632044672966, "rewards/bleu_reward_func/mean": 0.09155917167663574, "rewards/bleu_reward_func/std": 0.1374584585428238, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 301.6875, "completions/mean_terminated_length": 253.1538543701172, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3056, "grad_norm": 3.2053592205047607, "kl": 0.051788330078125, "learning_rate": 1e-06, "loss": 0.2446, "num_tokens": 5041090.0, "reward": 0.06323020905256271, "reward_std": 0.032996732741594315, "rewards/bleu_reward_func/mean": 0.06323020905256271, "rewards/bleu_reward_func/std": 0.05562639981508255, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 160.92308044433594, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3064, "grad_norm": 7.353909492492676, "kl": 0.07830810546875, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 5057798.0, "reward": 0.1571401059627533, "reward_std": 0.02875007688999176, "rewards/bleu_reward_func/mean": 0.1571401059627533, "rewards/bleu_reward_func/std": 0.20372198522090912, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 316.84375, "completions/mean_terminated_length": 199.75, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3072, "grad_norm": 3.984431743621826, "kl": 0.066986083984375, "learning_rate": 1e-06, "loss": 0.0376, "num_tokens": 5073449.0, "reward": 0.03315318748354912, "reward_std": 0.038507476449012756, "rewards/bleu_reward_func/mean": 0.03315318748354912, "rewards/bleu_reward_func/std": 0.06562887132167816, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 319.375, "completions/mean_terminated_length": 126.75, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.308, "grad_norm": 6.559665203094482, "kl": 0.0677490234375, "learning_rate": 1e-06, "loss": 0.3157, "num_tokens": 5087821.0, "reward": 0.06230534613132477, "reward_std": 0.03765605762600899, "rewards/bleu_reward_func/mean": 0.06230534613132477, "rewards/bleu_reward_func/std": 0.07213454693555832, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 319.15625, "completions/mean_terminated_length": 265.1600036621094, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3088, "grad_norm": 3.6326193809509277, "kl": 0.05596923828125, "learning_rate": 1e-06, "loss": -0.0929, "num_tokens": 5100618.0, "reward": 0.04398781806230545, "reward_std": 0.02026546560227871, "rewards/bleu_reward_func/mean": 0.04398781806230545, "rewards/bleu_reward_func/std": 0.042056936770677567, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 181.3333282470703, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.3096, "grad_norm": 4.189205646514893, "kl": 0.0577392578125, "learning_rate": 1e-06, "loss": 0.0616, "num_tokens": 5118850.0, "reward": 0.10049895197153091, "reward_std": 0.035130538046360016, "rewards/bleu_reward_func/mean": 0.10049895197153091, "rewards/bleu_reward_func/std": 0.0897059291601181, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 357.125, "completions/mean_terminated_length": 251.15789794921875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3104, "grad_norm": 8.503087997436523, "kl": 0.1228790283203125, "learning_rate": 1e-06, "loss": 0.1152, "num_tokens": 5131574.0, "reward": 0.10157294571399689, "reward_std": 0.05235150083899498, "rewards/bleu_reward_func/mean": 0.10157294571399689, "rewards/bleu_reward_func/std": 0.11832693964242935, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 261.0625, "completions/mean_terminated_length": 244.33334350585938, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3112, "grad_norm": 7.511518478393555, "kl": 0.0782470703125, "learning_rate": 1e-06, "loss": 0.1551, "num_tokens": 5142288.0, "reward": 0.05309104174375534, "reward_std": 0.0195770300924778, "rewards/bleu_reward_func/mean": 0.05309104174375534, "rewards/bleu_reward_func/std": 0.03859832510352135, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 77.8125, "completions/mean_terminated_length": 77.8125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.312, "grad_norm": 7.358268737792969, "kl": 0.1142578125, "learning_rate": 1e-06, "loss": 0.0692, "num_tokens": 5147226.0, "reward": 0.2647009789943695, "reward_std": 0.0788542777299881, "rewards/bleu_reward_func/mean": 0.2647009789943695, "rewards/bleu_reward_func/std": 0.3669854998588562, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 331.1875, "completions/mean_terminated_length": 126.26667022705078, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3128, "grad_norm": 6.546727180480957, "kl": 0.131866455078125, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 5162552.0, "reward": 0.06478870660066605, "reward_std": 0.016362179070711136, "rewards/bleu_reward_func/mean": 0.06478870660066605, "rewards/bleu_reward_func/std": 0.07661883533000946, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 305.5, "completions/mean_terminated_length": 211.63636779785156, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3136, "grad_norm": 3.7394042015075684, "kl": 0.0411224365234375, "learning_rate": 1e-06, "loss": -0.0472, "num_tokens": 5174632.0, "reward": 0.07655475288629532, "reward_std": 0.04063459113240242, "rewards/bleu_reward_func/mean": 0.07655475288629532, "rewards/bleu_reward_func/std": 0.05244217440485954, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 299.4375, "completions/mean_terminated_length": 239.9199981689453, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.3144, "grad_norm": 3.130519151687622, "kl": 0.036407470703125, "learning_rate": 1e-06, "loss": 0.0573, "num_tokens": 5189038.0, "reward": 0.08177624642848969, "reward_std": 0.03700428456068039, "rewards/bleu_reward_func/mean": 0.08177624642848969, "rewards/bleu_reward_func/std": 0.07332108914852142, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 152.53334045410156, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3152, "grad_norm": 6.235530853271484, "kl": 0.119140625, "learning_rate": 1e-06, "loss": -0.1315, "num_tokens": 5199614.0, "reward": 0.08668357878923416, "reward_std": 0.029862932860851288, "rewards/bleu_reward_func/mean": 0.08668357878923416, "rewards/bleu_reward_func/std": 0.04458598420023918, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 303.21875, "completions/mean_terminated_length": 140.8333282470703, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.316, "grad_norm": 3.7761735916137695, "kl": 0.051483154296875, "learning_rate": 1e-06, "loss": 0.4336, "num_tokens": 5216893.0, "reward": 0.04373088479042053, "reward_std": 0.025996902957558632, "rewards/bleu_reward_func/mean": 0.04373088479042053, "rewards/bleu_reward_func/std": 0.035521000623703, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 102.375, "completions/mean_terminated_length": 89.16128540039062, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3168, "grad_norm": 9.422346115112305, "kl": 0.20269775390625, "learning_rate": 1e-06, "loss": -0.3887, "num_tokens": 5222225.0, "reward": 0.0936415046453476, "reward_std": 0.07821927219629288, "rewards/bleu_reward_func/mean": 0.0936415046453476, "rewards/bleu_reward_func/std": 0.1016775444149971, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 127.85185241699219, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3176, "grad_norm": 16.8355712890625, "kl": 0.111968994140625, "learning_rate": 1e-06, "loss": 0.2515, "num_tokens": 5234477.0, "reward": 0.2821354866027832, "reward_std": 0.16070716083049774, "rewards/bleu_reward_func/mean": 0.2821354866027832, "rewards/bleu_reward_func/std": 0.34524035453796387, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 169.0370330810547, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3184, "grad_norm": 4.937644004821777, "kl": 0.0895843505859375, "learning_rate": 1e-06, "loss": 0.1443, "num_tokens": 5243161.0, "reward": 0.04823939502239227, "reward_std": 0.020888181403279305, "rewards/bleu_reward_func/mean": 0.04823939502239227, "rewards/bleu_reward_func/std": 0.032690465450286865, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 330.5625, "completions/mean_terminated_length": 304.64288330078125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.3192, "grad_norm": 2.7899651527404785, "kl": 0.028900146484375, "learning_rate": 1e-06, "loss": -0.0361, "num_tokens": 5257211.0, "reward": 0.10274805128574371, "reward_std": 0.03329307958483696, "rewards/bleu_reward_func/mean": 0.10274805128574371, "rewards/bleu_reward_func/std": 0.08635566383600235, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 125.5714340209961, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.32, "grad_norm": 9.990334510803223, "kl": 0.19793701171875, "learning_rate": 1e-06, "loss": 0.0415, "num_tokens": 5265263.0, "reward": 0.13340914249420166, "reward_std": 0.06052035093307495, "rewards/bleu_reward_func/mean": 0.13340914249420166, "rewards/bleu_reward_func/std": 0.12332285940647125, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 87.25, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3208, "grad_norm": 5.343194007873535, "kl": 0.0823516845703125, "learning_rate": 1e-06, "loss": 0.1588, "num_tokens": 5279491.0, "reward": 0.04100114479660988, "reward_std": 0.021917924284934998, "rewards/bleu_reward_func/mean": 0.04100114479660988, "rewards/bleu_reward_func/std": 0.059245530515909195, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 363.96875, "completions/mean_terminated_length": 296.68182373046875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.3216, "grad_norm": 2.502444267272949, "kl": 0.0249786376953125, "learning_rate": 1e-06, "loss": -0.1811, "num_tokens": 5298618.0, "reward": 0.06452260166406631, "reward_std": 0.043596021831035614, "rewards/bleu_reward_func/mean": 0.06452260166406631, "rewards/bleu_reward_func/std": 0.0457596592605114, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 186.8125, "completions/mean_terminated_length": 153.1724090576172, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.3224, "grad_norm": 6.430903434753418, "kl": 0.0926513671875, "learning_rate": 1e-06, "loss": 0.1091, "num_tokens": 5308788.0, "reward": 0.1375400573015213, "reward_std": 0.044691912829875946, "rewards/bleu_reward_func/mean": 0.1375400573015213, "rewards/bleu_reward_func/std": 0.1667727530002594, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 261.375, "completions/mean_terminated_length": 163.30435180664062, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3232, "grad_norm": 7.348942279815674, "kl": 0.114410400390625, "learning_rate": 1e-06, "loss": 0.124, "num_tokens": 5325816.0, "reward": 0.29955723881721497, "reward_std": 0.09420829266309738, "rewards/bleu_reward_func/mean": 0.29955723881721497, "rewards/bleu_reward_func/std": 0.27135762572288513, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 313.125, "completions/mean_terminated_length": 284.71429443359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.324, "grad_norm": 5.8108601570129395, "kl": 0.0474853515625, "learning_rate": 1e-06, "loss": -0.0256, "num_tokens": 5339252.0, "reward": 0.125982865691185, "reward_std": 0.03331389278173447, "rewards/bleu_reward_func/mean": 0.125982865691185, "rewards/bleu_reward_func/std": 0.07514968514442444, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 221.1875, "completions/mean_terminated_length": 154.07693481445312, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.3248, "grad_norm": 6.334237098693848, "kl": 0.167236328125, "learning_rate": 1e-06, "loss": 0.2154, "num_tokens": 5350538.0, "reward": 0.12314164638519287, "reward_std": 0.034954577684402466, "rewards/bleu_reward_func/mean": 0.12314164638519287, "rewards/bleu_reward_func/std": 0.11711690574884415, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 182.96875, "completions/mean_terminated_length": 90.83999633789062, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3256, "grad_norm": 6.83364200592041, "kl": 0.09881591796875, "learning_rate": 1e-06, "loss": 0.2224, "num_tokens": 5364465.0, "reward": 0.23839128017425537, "reward_std": 0.09448365867137909, "rewards/bleu_reward_func/mean": 0.23839128017425537, "rewards/bleu_reward_func/std": 0.17264093458652496, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 251.59375, "completions/mean_terminated_length": 164.7916717529297, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3264, "grad_norm": 5.291790962219238, "kl": 0.120269775390625, "learning_rate": 1e-06, "loss": 0.1051, "num_tokens": 5379212.0, "reward": 0.07936831563711166, "reward_std": 0.026489000767469406, "rewards/bleu_reward_func/mean": 0.07936831563711166, "rewards/bleu_reward_func/std": 0.04656874015927315, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 395.90625, "completions/mean_terminated_length": 279.8125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3272, "grad_norm": 2.7209274768829346, "kl": 0.033355712890625, "learning_rate": 1e-06, "loss": -0.0145, "num_tokens": 5395369.0, "reward": 0.05327831208705902, "reward_std": 0.020644793286919594, "rewards/bleu_reward_func/mean": 0.05327831208705902, "rewards/bleu_reward_func/std": 0.044744666665792465, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 279.125, "completions/mean_terminated_length": 157.14285278320312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.328, "grad_norm": 6.835958003997803, "kl": 0.17645263671875, "learning_rate": 1e-06, "loss": 0.1004, "num_tokens": 5408861.0, "reward": 0.15895725786685944, "reward_std": 0.053282976150512695, "rewards/bleu_reward_func/mean": 0.15895725786685944, "rewards/bleu_reward_func/std": 0.1344875991344452, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 124.625, "completions/mean_terminated_length": 112.1290283203125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3288, "grad_norm": 7.9765801429748535, "kl": 0.12908935546875, "learning_rate": 1e-06, "loss": 0.0791, "num_tokens": 5422569.0, "reward": 0.29637736082077026, "reward_std": 0.07562527060508728, "rewards/bleu_reward_func/mean": 0.29637736082077026, "rewards/bleu_reward_func/std": 0.1916900873184204, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 242.6875, "completions/mean_terminated_length": 180.53846740722656, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.3296, "grad_norm": 5.444021701812744, "kl": 0.079376220703125, "learning_rate": 1e-06, "loss": -0.0428, "num_tokens": 5432847.0, "reward": 0.1152123510837555, "reward_std": 0.07390551269054413, "rewards/bleu_reward_func/mean": 0.1152123510837555, "rewards/bleu_reward_func/std": 0.14451570808887482, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 329.4375, "completions/mean_terminated_length": 204.5263214111328, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3304, "grad_norm": 14.007586479187012, "kl": 0.074188232421875, "learning_rate": 1e-06, "loss": 0.0903, "num_tokens": 5451693.0, "reward": 0.13860949873924255, "reward_std": 0.032740939408540726, "rewards/bleu_reward_func/mean": 0.13860949873924255, "rewards/bleu_reward_func/std": 0.15230515599250793, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 370.0, "completions/mean_terminated_length": 322.66668701171875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.3312, "grad_norm": 2.6279470920562744, "kl": 0.02838134765625, "learning_rate": 1e-06, "loss": 0.1011, "num_tokens": 5465741.0, "reward": 0.07638199627399445, "reward_std": 0.018498672172427177, "rewards/bleu_reward_func/mean": 0.07638199627399445, "rewards/bleu_reward_func/std": 0.07297802716493607, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 433.96875, "completions/mean_terminated_length": 403.4347839355469, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.332, "grad_norm": 2.4823691844940186, "kl": 0.035186767578125, "learning_rate": 1e-06, "loss": -0.0328, "num_tokens": 5482924.0, "reward": 0.06871578842401505, "reward_std": 0.015666324645280838, "rewards/bleu_reward_func/mean": 0.06871578842401505, "rewards/bleu_reward_func/std": 0.03051225282251835, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 241.5, "completions/mean_terminated_length": 179.07693481445312, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3328, "grad_norm": 6.543938159942627, "kl": 0.16180419921875, "learning_rate": 1e-06, "loss": 0.0852, "num_tokens": 5494084.0, "reward": 0.1368054300546646, "reward_std": 0.05007235333323479, "rewards/bleu_reward_func/mean": 0.1368054300546646, "rewards/bleu_reward_func/std": 0.17140735685825348, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 440.0, "completions/mean_terminated_length": 390.7368469238281, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.3336, "grad_norm": 2.235297203063965, "kl": 0.03033447265625, "learning_rate": 1e-06, "loss": -0.0838, "num_tokens": 5511716.0, "reward": 0.038143888115882874, "reward_std": 0.01655811443924904, "rewards/bleu_reward_func/mean": 0.038143888115882874, "rewards/bleu_reward_func/std": 0.024868454784154892, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 182.3125, "completions/mean_terminated_length": 72.41667175292969, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.3344, "grad_norm": 7.744441509246826, "kl": 0.191436767578125, "learning_rate": 1e-06, "loss": 0.3195, "num_tokens": 5523022.0, "reward": 0.31701600551605225, "reward_std": 0.07194612175226212, "rewards/bleu_reward_func/mean": 0.31701600551605225, "rewards/bleu_reward_func/std": 0.3555218279361725, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 366.15625, "completions/mean_terminated_length": 237.47059631347656, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.3352, "grad_norm": 6.1128129959106445, "kl": 0.156463623046875, "learning_rate": 1e-06, "loss": -0.0201, "num_tokens": 5541259.0, "reward": 0.08823719620704651, "reward_std": 0.024577319622039795, "rewards/bleu_reward_func/mean": 0.08823719620704651, "rewards/bleu_reward_func/std": 0.06854464113712311, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 441.34375, "completions/mean_terminated_length": 361.2666931152344, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.336, "grad_norm": 1.8351125717163086, "kl": 0.021087646484375, "learning_rate": 1e-06, "loss": 0.1005, "num_tokens": 5559934.0, "reward": 0.04894189164042473, "reward_std": 0.02001025900244713, "rewards/bleu_reward_func/mean": 0.04894189164042473, "rewards/bleu_reward_func/std": 0.05484846979379654, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 326.625, "completions/mean_terminated_length": 264.8333435058594, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3368, "grad_norm": 2.533376932144165, "kl": 0.036773681640625, "learning_rate": 1e-06, "loss": -0.0233, "num_tokens": 5573530.0, "reward": 0.040375903248786926, "reward_std": 0.020407570526003838, "rewards/bleu_reward_func/mean": 0.040375903248786926, "rewards/bleu_reward_func/std": 0.03530384972691536, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 387.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.3376, "grad_norm": 2.77024507522583, "kl": 0.023895263671875, "learning_rate": 1e-06, "loss": -0.0548, "num_tokens": 5587906.0, "reward": 0.07852312177419662, "reward_std": 0.01865551620721817, "rewards/bleu_reward_func/mean": 0.07852312177419662, "rewards/bleu_reward_func/std": 0.01962001994252205, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 223.51998901367188, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.3384, "grad_norm": 10.24564266204834, "kl": 0.1881103515625, "learning_rate": 1e-06, "loss": 0.3541, "num_tokens": 5600862.0, "reward": 0.1451932042837143, "reward_std": 0.04526112228631973, "rewards/bleu_reward_func/mean": 0.1451932042837143, "rewards/bleu_reward_func/std": 0.11114869266748428, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 374.0, "completions/mean_terminated_length": 217.60000610351562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3392, "grad_norm": 3.2997934818267822, "kl": 0.03955078125, "learning_rate": 1e-06, "loss": 0.0442, "num_tokens": 5614662.0, "reward": 0.029227450489997864, "reward_std": 0.015134407207369804, "rewards/bleu_reward_func/mean": 0.029227450489997864, "rewards/bleu_reward_func/std": 0.03273903205990791, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 266.59375, "completions/mean_terminated_length": 184.7916717529297, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.34, "grad_norm": 3.9605484008789062, "kl": 0.05718994140625, "learning_rate": 1e-06, "loss": 0.3365, "num_tokens": 5625193.0, "reward": 0.07731406390666962, "reward_std": 0.04166540876030922, "rewards/bleu_reward_func/mean": 0.07731406390666962, "rewards/bleu_reward_func/std": 0.07211390882730484, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 88.63157653808594, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3408, "grad_norm": 11.257648468017578, "kl": 0.2677154541015625, "learning_rate": 1e-06, "loss": 0.1169, "num_tokens": 5640717.0, "reward": 0.19435667991638184, "reward_std": 0.055491410195827484, "rewards/bleu_reward_func/mean": 0.19435667991638184, "rewards/bleu_reward_func/std": 0.1956581324338913, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 376.9375, "completions/mean_terminated_length": 179.53846740722656, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3416, "grad_norm": 25.653825759887695, "kl": 0.1090087890625, "learning_rate": 1e-06, "loss": 0.024, "num_tokens": 5655923.0, "reward": 0.11750101298093796, "reward_std": 0.0449095293879509, "rewards/bleu_reward_func/mean": 0.11750101298093796, "rewards/bleu_reward_func/std": 0.10332971811294556, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 313.0, "completions/mean_terminated_length": 235.13043212890625, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.3424, "grad_norm": 3.291689157485962, "kl": 0.05206298828125, "learning_rate": 1e-06, "loss": -0.0722, "num_tokens": 5672371.0, "reward": 0.07329948246479034, "reward_std": 0.04769134148955345, "rewards/bleu_reward_func/mean": 0.07329948246479034, "rewards/bleu_reward_func/std": 0.10588011890649796, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 218.1875, "completions/mean_terminated_length": 176.21429443359375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3432, "grad_norm": 70.38253784179688, "kl": 0.111907958984375, "learning_rate": 1e-06, "loss": 0.2102, "num_tokens": 5685137.0, "reward": 0.057677462697029114, "reward_std": 0.02635624073445797, "rewards/bleu_reward_func/mean": 0.057677462697029114, "rewards/bleu_reward_func/std": 0.03576910123229027, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 276.34375, "completions/mean_terminated_length": 184.13043212890625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.344, "grad_norm": 6.003584861755371, "kl": 0.0679931640625, "learning_rate": 1e-06, "loss": 0.0754, "num_tokens": 5699164.0, "reward": 0.1447058618068695, "reward_std": 0.02169397845864296, "rewards/bleu_reward_func/mean": 0.1447058618068695, "rewards/bleu_reward_func/std": 0.17934927344322205, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 179.61289978027344, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3448, "grad_norm": 6.911223888397217, "kl": 0.186920166015625, "learning_rate": 1e-06, "loss": -0.1198, "num_tokens": 5707324.0, "reward": 0.1218734011054039, "reward_std": 0.029896825551986694, "rewards/bleu_reward_func/mean": 0.1218734011054039, "rewards/bleu_reward_func/std": 0.12784428894519806, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 161.3125, "completions/mean_terminated_length": 161.3125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3456, "grad_norm": 7.491186141967773, "kl": 0.174072265625, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 5715774.0, "reward": 0.24741162359714508, "reward_std": 0.06959841400384903, "rewards/bleu_reward_func/mean": 0.24741162359714508, "rewards/bleu_reward_func/std": 0.12952403724193573, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 360.375, "completions/mean_terminated_length": 280.952392578125, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.3464, "grad_norm": 2.933356523513794, "kl": 0.05389404296875, "learning_rate": 1e-06, "loss": 0.1508, "num_tokens": 5729490.0, "reward": 0.047768086194992065, "reward_std": 0.022835325449705124, "rewards/bleu_reward_func/mean": 0.047768086194992065, "rewards/bleu_reward_func/std": 0.03785131126642227, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 290.65625, "completions/mean_terminated_length": 216.875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3472, "grad_norm": 4.7709503173828125, "kl": 0.202667236328125, "learning_rate": 1e-06, "loss": -0.0613, "num_tokens": 5744007.0, "reward": 0.17955930531024933, "reward_std": 0.04158224165439606, "rewards/bleu_reward_func/mean": 0.17955930531024933, "rewards/bleu_reward_func/std": 0.16465015709400177, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 471.46875, "completions/mean_terminated_length": 430.9375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.348, "grad_norm": 2.0240750312805176, "kl": 0.0249786376953125, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 5764798.0, "reward": 0.06078977510333061, "reward_std": 0.014253700152039528, "rewards/bleu_reward_func/mean": 0.06078977510333061, "rewards/bleu_reward_func/std": 0.061424292623996735, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 213.56521606445312, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.3488, "grad_norm": 6.2941999435424805, "kl": 0.14788818359375, "learning_rate": 1e-06, "loss": -0.117, "num_tokens": 5778070.0, "reward": 0.18015003204345703, "reward_std": 0.04164495691657066, "rewards/bleu_reward_func/mean": 0.18015003204345703, "rewards/bleu_reward_func/std": 0.25248411297798157, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 319.0625, "completions/mean_terminated_length": 265.0400085449219, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.3496, "grad_norm": 6.8885884284973145, "kl": 0.15325927734375, "learning_rate": 1e-06, "loss": 0.1008, "num_tokens": 5794464.0, "reward": 0.06313855201005936, "reward_std": 0.01877717673778534, "rewards/bleu_reward_func/mean": 0.06313855201005936, "rewards/bleu_reward_func/std": 0.07749292254447937, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 174.96875, "completions/mean_terminated_length": 126.8214340209961, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3504, "grad_norm": 6.638815402984619, "kl": 0.21893310546875, "learning_rate": 1e-06, "loss": -0.1077, "num_tokens": 5808279.0, "reward": 0.1649433970451355, "reward_std": 0.03847195580601692, "rewards/bleu_reward_func/mean": 0.1649433970451355, "rewards/bleu_reward_func/std": 0.1434909999370575, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 111.46875, "completions/mean_terminated_length": 111.46875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3512, "grad_norm": 8.658287048339844, "kl": 0.380615234375, "learning_rate": 1e-06, "loss": -0.0241, "num_tokens": 5818006.0, "reward": 0.16367265582084656, "reward_std": 0.043664492666721344, "rewards/bleu_reward_func/mean": 0.16367265582084656, "rewards/bleu_reward_func/std": 0.09786061942577362, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 183.0625, "completions/mean_terminated_length": 161.1333465576172, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.352, "grad_norm": 6.755293369293213, "kl": 0.094757080078125, "learning_rate": 1e-06, "loss": -0.3775, "num_tokens": 5827832.0, "reward": 0.20365653932094574, "reward_std": 0.022682592272758484, "rewards/bleu_reward_func/mean": 0.20365653932094574, "rewards/bleu_reward_func/std": 0.28341981768608093, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 374.15625, "completions/mean_terminated_length": 196.92857360839844, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.3528, "grad_norm": 3.652517557144165, "kl": 0.04571533203125, "learning_rate": 1e-06, "loss": 0.1243, "num_tokens": 5846877.0, "reward": 0.028015542775392532, "reward_std": 0.017580918967723846, "rewards/bleu_reward_func/mean": 0.028015542775392532, "rewards/bleu_reward_func/std": 0.018063105642795563, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 247.96875, "completions/mean_terminated_length": 174.0399932861328, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3536, "grad_norm": 5.749145984649658, "kl": 0.2269287109375, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 5857276.0, "reward": 0.24086514115333557, "reward_std": 0.11034538596868515, "rewards/bleu_reward_func/mean": 0.24086514115333557, "rewards/bleu_reward_func/std": 0.2930907607078552, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 180.9375, "completions/mean_terminated_length": 88.23999786376953, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3544, "grad_norm": 6.519045352935791, "kl": 0.3109130859375, "learning_rate": 1e-06, "loss": 0.0523, "num_tokens": 5866418.0, "reward": 0.14787587523460388, "reward_std": 0.08442827314138412, "rewards/bleu_reward_func/mean": 0.14787587523460388, "rewards/bleu_reward_func/std": 0.13120223581790924, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 290.6875, "completions/mean_terminated_length": 239.61538696289062, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.3552, "grad_norm": 3.2144103050231934, "kl": 0.042388916015625, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 5878832.0, "reward": 0.06585465371608734, "reward_std": 0.03217202052474022, "rewards/bleu_reward_func/mean": 0.06585465371608734, "rewards/bleu_reward_func/std": 0.0564405731856823, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 226.0625, "completions/mean_terminated_length": 196.48275756835938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.356, "grad_norm": 7.220034122467041, "kl": 0.255218505859375, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 5894266.0, "reward": 0.1998336911201477, "reward_std": 0.05887780338525772, "rewards/bleu_reward_func/mean": 0.1998336911201477, "rewards/bleu_reward_func/std": 0.1896047741174698, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 194.46875, "completions/mean_terminated_length": 173.3000030517578, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.3568, "grad_norm": 5.338675022125244, "kl": 0.112518310546875, "learning_rate": 1e-06, "loss": 0.1041, "num_tokens": 5902393.0, "reward": 0.08252020180225372, "reward_std": 0.041884347796440125, "rewards/bleu_reward_func/mean": 0.08252020180225372, "rewards/bleu_reward_func/std": 0.05604247748851776, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 266.96875, "completions/mean_terminated_length": 231.96429443359375, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.3576, "grad_norm": 3.9111521244049072, "kl": 0.048919677734375, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 5914296.0, "reward": 0.2005537748336792, "reward_std": 0.03531679883599281, "rewards/bleu_reward_func/mean": 0.2005537748336792, "rewards/bleu_reward_func/std": 0.1125224232673645, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 89.11111450195312, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.3584, "grad_norm": 18.863727569580078, "kl": 0.269134521484375, "learning_rate": 1e-06, "loss": 0.4411, "num_tokens": 5927172.0, "reward": 0.12709318101406097, "reward_std": 0.020968245342373848, "rewards/bleu_reward_func/mean": 0.12709318101406097, "rewards/bleu_reward_func/std": 0.14331206679344177, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 378.21875, "completions/mean_terminated_length": 297.95001220703125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.3592, "grad_norm": 2.759582996368408, "kl": 0.02587890625, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 5944491.0, "reward": 0.04890431463718414, "reward_std": 0.01871412619948387, "rewards/bleu_reward_func/mean": 0.04890431463718414, "rewards/bleu_reward_func/std": 0.05281543731689453, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 234.8125, "completions/mean_terminated_length": 216.33334350585938, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.36, "grad_norm": 5.44106388092041, "kl": 0.08453369140625, "learning_rate": 1e-06, "loss": -0.2079, "num_tokens": 5954413.0, "reward": 0.08892585337162018, "reward_std": 0.05316928029060364, "rewards/bleu_reward_func/mean": 0.08892585337162018, "rewards/bleu_reward_func/std": 0.09096309542655945, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 182.59375, "completions/mean_terminated_length": 90.36000061035156, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3608, "grad_norm": 10.483473777770996, "kl": 0.30169677734375, "learning_rate": 1e-06, "loss": 0.1221, "num_tokens": 5965776.0, "reward": 0.2010711133480072, "reward_std": 0.035105034708976746, "rewards/bleu_reward_func/mean": 0.2010711133480072, "rewards/bleu_reward_func/std": 0.20054543018341064, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 266.4827575683594, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.3616, "grad_norm": 8.551454544067383, "kl": 0.21197509765625, "learning_rate": 1e-06, "loss": 0.0288, "num_tokens": 5976704.0, "reward": 0.03945029526948929, "reward_std": 0.011974655091762543, "rewards/bleu_reward_func/mean": 0.03945029526948929, "rewards/bleu_reward_func/std": 0.027504391968250275, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 241.90625, "completions/mean_terminated_length": 241.90625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3624, "grad_norm": 5.853670597076416, "kl": 0.209869384765625, "learning_rate": 1e-06, "loss": -0.0173, "num_tokens": 5987269.0, "reward": 0.09715719521045685, "reward_std": 0.009554330259561539, "rewards/bleu_reward_func/mean": 0.09715719521045685, "rewards/bleu_reward_func/std": 0.0827893614768982, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 258.6875, "completions/mean_terminated_length": 143.5454559326172, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3632, "grad_norm": 9.270416259765625, "kl": 0.208038330078125, "learning_rate": 1e-06, "loss": 0.0672, "num_tokens": 6001411.0, "reward": 0.1554635763168335, "reward_std": 0.03311417996883392, "rewards/bleu_reward_func/mean": 0.1554635763168335, "rewards/bleu_reward_func/std": 0.1801016479730606, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 196.65625, "completions/mean_terminated_length": 196.65625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.364, "grad_norm": 11.34135913848877, "kl": 0.224945068359375, "learning_rate": 1e-06, "loss": 0.3216, "num_tokens": 6012208.0, "reward": 0.058545198291540146, "reward_std": 0.017396699637174606, "rewards/bleu_reward_func/mean": 0.058545198291540146, "rewards/bleu_reward_func/std": 0.04106508567929268, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 137.8125, "completions/mean_terminated_length": 137.8125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.3648, "grad_norm": 5.569085597991943, "kl": 0.173675537109375, "learning_rate": 1e-06, "loss": -0.0886, "num_tokens": 6020026.0, "reward": 0.25735002756118774, "reward_std": 0.08652571588754654, "rewards/bleu_reward_func/mean": 0.25735002756118774, "rewards/bleu_reward_func/std": 0.34091776609420776, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 261.0625, "completions/mean_terminated_length": 162.86956787109375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3656, "grad_norm": 10.537775993347168, "kl": 0.191162109375, "learning_rate": 1e-06, "loss": 0.1318, "num_tokens": 6031956.0, "reward": 0.12902843952178955, "reward_std": 0.049239080399274826, "rewards/bleu_reward_func/mean": 0.12902843952178955, "rewards/bleu_reward_func/std": 0.1560073047876358, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 267.65625, "completions/mean_terminated_length": 156.59091186523438, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.3664, "grad_norm": 8.385242462158203, "kl": 0.13568115234375, "learning_rate": 1e-06, "loss": 0.1967, "num_tokens": 6048289.0, "reward": 0.09441059827804565, "reward_std": 0.02894745022058487, "rewards/bleu_reward_func/mean": 0.09441059827804565, "rewards/bleu_reward_func/std": 0.07357289642095566, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 144.3125, "completions/mean_terminated_length": 76.22222137451172, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3672, "grad_norm": 9.128081321716309, "kl": 0.275970458984375, "learning_rate": 1e-06, "loss": -0.0309, "num_tokens": 6060955.0, "reward": 0.23786574602127075, "reward_std": 0.04663696512579918, "rewards/bleu_reward_func/mean": 0.23786574602127075, "rewards/bleu_reward_func/std": 0.15007296204566956, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 262.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.368, "grad_norm": 10.163530349731445, "kl": 0.1614837646484375, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 6075291.0, "reward": 0.11764833331108093, "reward_std": 0.025302093476057053, "rewards/bleu_reward_func/mean": 0.11764833331108093, "rewards/bleu_reward_func/std": 0.054068438708782196, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 160.78125, "completions/mean_terminated_length": 137.36666870117188, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3688, "grad_norm": 7.89539909362793, "kl": 0.18206787109375, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 6083980.0, "reward": 0.0945214033126831, "reward_std": 0.046040039509534836, "rewards/bleu_reward_func/mean": 0.0945214033126831, "rewards/bleu_reward_func/std": 0.08345890045166016, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 254.96875, "completions/mean_terminated_length": 246.6774139404297, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3696, "grad_norm": 6.462737560272217, "kl": 0.130950927734375, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 6096555.0, "reward": 0.04283145070075989, "reward_std": 0.010249357670545578, "rewards/bleu_reward_func/mean": 0.04283145070075989, "rewards/bleu_reward_func/std": 0.038907162845134735, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 83.83333587646484, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3704, "grad_norm": 5.569899559020996, "kl": 0.115325927734375, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 6105487.0, "reward": 0.13501238822937012, "reward_std": 0.034556735306978226, "rewards/bleu_reward_func/mean": 0.13501238822937012, "rewards/bleu_reward_func/std": 0.09039971977472305, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 219.53125, "completions/mean_terminated_length": 137.63999938964844, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.3712, "grad_norm": 12.397187232971191, "kl": 0.133880615234375, "learning_rate": 1e-06, "loss": -0.0829, "num_tokens": 6117800.0, "reward": 0.18308544158935547, "reward_std": 0.06162799149751663, "rewards/bleu_reward_func/mean": 0.18308544158935547, "rewards/bleu_reward_func/std": 0.15996244549751282, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 220.65625, "completions/mean_terminated_length": 190.51724243164062, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.372, "grad_norm": 7.391514301300049, "kl": 0.11492919921875, "learning_rate": 1e-06, "loss": -0.0768, "num_tokens": 6128413.0, "reward": 0.05292118340730667, "reward_std": 0.04890108108520508, "rewards/bleu_reward_func/mean": 0.05292118340730667, "rewards/bleu_reward_func/std": 0.07255055755376816, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 194.15625, "completions/mean_terminated_length": 194.15625, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.3728, "grad_norm": 3.9842867851257324, "kl": 0.050048828125, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 6137562.0, "reward": 0.04538443684577942, "reward_std": 0.024577371776103973, "rewards/bleu_reward_func/mean": 0.04538443684577942, "rewards/bleu_reward_func/std": 0.03160402178764343, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 368.0, "completions/mean_terminated_length": 311.6521911621094, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.3736, "grad_norm": 2.496399402618408, "kl": 0.025543212890625, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 6152090.0, "reward": 0.062375668436288834, "reward_std": 0.031018512323498726, "rewards/bleu_reward_func/mean": 0.062375668436288834, "rewards/bleu_reward_func/std": 0.06766829639673233, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 341.59375, "completions/mean_terminated_length": 302.2692565917969, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.3744, "grad_norm": 4.490657329559326, "kl": 0.042816162109375, "learning_rate": 1e-06, "loss": 0.2011, "num_tokens": 6165917.0, "reward": 0.06601699441671371, "reward_std": 0.028723105788230896, "rewards/bleu_reward_func/mean": 0.06601699441671371, "rewards/bleu_reward_func/std": 0.039854664355516434, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 250.21875, "completions/mean_terminated_length": 147.78260803222656, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3752, "grad_norm": 4.7490010261535645, "kl": 0.1627197265625, "learning_rate": 1e-06, "loss": -0.0409, "num_tokens": 6178940.0, "reward": 0.15887555480003357, "reward_std": 0.018191883340477943, "rewards/bleu_reward_func/mean": 0.15887555480003357, "rewards/bleu_reward_func/std": 0.21522025763988495, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 396.125, "completions/mean_terminated_length": 264.8000183105469, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.376, "grad_norm": 3.33166241645813, "kl": 0.046051025390625, "learning_rate": 1e-06, "loss": -0.0611, "num_tokens": 6194232.0, "reward": 0.0860922709107399, "reward_std": 0.04104076325893402, "rewards/bleu_reward_func/mean": 0.0860922709107399, "rewards/bleu_reward_func/std": 0.13754135370254517, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 364.6875, "completions/mean_terminated_length": 323.44000244140625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.3768, "grad_norm": 2.6695375442504883, "kl": 0.038360595703125, "learning_rate": 1e-06, "loss": -0.0899, "num_tokens": 6207750.0, "reward": 0.05763555318117142, "reward_std": 0.022492559626698494, "rewards/bleu_reward_func/mean": 0.05763555318117142, "rewards/bleu_reward_func/std": 0.034512683749198914, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 237.40625, "completions/mean_terminated_length": 209.0, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.3776, "grad_norm": 4.532895565032959, "kl": 0.088165283203125, "learning_rate": 1e-06, "loss": 0.0908, "num_tokens": 6220235.0, "reward": 0.07317312806844711, "reward_std": 0.02968096360564232, "rewards/bleu_reward_func/mean": 0.07317312806844711, "rewards/bleu_reward_func/std": 0.04997172951698303, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 263.69232177734375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3784, "grad_norm": 2.9320926666259766, "kl": 0.04736328125, "learning_rate": 1e-06, "loss": -0.0956, "num_tokens": 6233691.0, "reward": 0.07909499108791351, "reward_std": 0.02384771592915058, "rewards/bleu_reward_func/mean": 0.07909499108791351, "rewards/bleu_reward_func/std": 0.08157114684581757, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 276.8125, "completions/mean_terminated_length": 233.25926208496094, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.3792, "grad_norm": 5.839748859405518, "kl": 0.14556884765625, "learning_rate": 1e-06, "loss": -0.0466, "num_tokens": 6245669.0, "reward": 0.10992265492677689, "reward_std": 0.027910416945815086, "rewards/bleu_reward_func/mean": 0.10992265492677689, "rewards/bleu_reward_func/std": 0.11659030616283417, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 194.09375, "completions/mean_terminated_length": 172.90000915527344, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.38, "grad_norm": 22.791318893432617, "kl": 0.30963134765625, "learning_rate": 1e-06, "loss": -0.0749, "num_tokens": 6255632.0, "reward": 0.14596156775951385, "reward_std": 0.0427117757499218, "rewards/bleu_reward_func/mean": 0.14596156775951385, "rewards/bleu_reward_func/std": 0.06039505451917648, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 147.03125, "completions/mean_terminated_length": 147.03125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3808, "grad_norm": 7.391219615936279, "kl": 0.34600830078125, "learning_rate": 1e-06, "loss": -0.0711, "num_tokens": 6267817.0, "reward": 0.155485600233078, "reward_std": 0.03775210678577423, "rewards/bleu_reward_func/mean": 0.155485600233078, "rewards/bleu_reward_func/std": 0.14854131639003754, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 176.09375, "completions/mean_terminated_length": 153.70001220703125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.3816, "grad_norm": 7.248151779174805, "kl": 0.1455078125, "learning_rate": 1e-06, "loss": -0.1443, "num_tokens": 6276772.0, "reward": 0.08080196380615234, "reward_std": 0.06804326176643372, "rewards/bleu_reward_func/mean": 0.08080196380615234, "rewards/bleu_reward_func/std": 0.11115432530641556, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 230.03125, "completions/mean_terminated_length": 151.0800018310547, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3824, "grad_norm": 8.359848022460938, "kl": 0.151123046875, "learning_rate": 1e-06, "loss": 0.36, "num_tokens": 6287581.0, "reward": 0.06686853617429733, "reward_std": 0.028161579743027687, "rewards/bleu_reward_func/mean": 0.06686853617429733, "rewards/bleu_reward_func/std": 0.054127294570207596, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 120.03125, "completions/mean_terminated_length": 120.03125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3832, "grad_norm": 7.5159101486206055, "kl": 0.190155029296875, "learning_rate": 1e-06, "loss": 0.1074, "num_tokens": 6297390.0, "reward": 0.19040237367153168, "reward_std": 0.05353376269340515, "rewards/bleu_reward_func/mean": 0.19040237367153168, "rewards/bleu_reward_func/std": 0.17947913706302643, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 451.4375, "completions/mean_terminated_length": 410.0, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.384, "grad_norm": 2.1026315689086914, "kl": 0.0289306640625, "learning_rate": 1e-06, "loss": 0.046, "num_tokens": 6314548.0, "reward": 0.09041387587785721, "reward_std": 0.04015309736132622, "rewards/bleu_reward_func/mean": 0.09041387587785721, "rewards/bleu_reward_func/std": 0.09059884399175644, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 319.5625, "completions/mean_terminated_length": 204.10000610351562, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.3848, "grad_norm": 5.199352264404297, "kl": 0.172821044921875, "learning_rate": 1e-06, "loss": -0.062, "num_tokens": 6333886.0, "reward": 0.13319844007492065, "reward_std": 0.03567848354578018, "rewards/bleu_reward_func/mean": 0.13319844007492065, "rewards/bleu_reward_func/std": 0.12437637895345688, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 208.9375, "completions/mean_terminated_length": 152.8148193359375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.3856, "grad_norm": 6.110198497772217, "kl": 0.2414398193359375, "learning_rate": 1e-06, "loss": -0.0203, "num_tokens": 6346564.0, "reward": 0.19878074526786804, "reward_std": 0.043283406645059586, "rewards/bleu_reward_func/mean": 0.19878074526786804, "rewards/bleu_reward_func/std": 0.1821635365486145, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 237.53125, "completions/mean_terminated_length": 186.70370483398438, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.3864, "grad_norm": 8.788106918334961, "kl": 0.17132568359375, "learning_rate": 1e-06, "loss": -0.0653, "num_tokens": 6358741.0, "reward": 0.07478289306163788, "reward_std": 0.019201520830392838, "rewards/bleu_reward_func/mean": 0.07478289306163788, "rewards/bleu_reward_func/std": 0.05620751157402992, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 202.4375, "completions/mean_terminated_length": 115.75999450683594, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.3872, "grad_norm": 7.577336311340332, "kl": 0.109588623046875, "learning_rate": 1e-06, "loss": 0.3355, "num_tokens": 6371787.0, "reward": 0.09253311157226562, "reward_std": 0.03513386473059654, "rewards/bleu_reward_func/mean": 0.09253311157226562, "rewards/bleu_reward_func/std": 0.0667162612080574, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 281.1199951171875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.388, "grad_norm": 5.319460391998291, "kl": 0.0972900390625, "learning_rate": 1e-06, "loss": 0.057, "num_tokens": 6388231.0, "reward": 0.16802164912223816, "reward_std": 0.024459581822156906, "rewards/bleu_reward_func/mean": 0.16802164912223816, "rewards/bleu_reward_func/std": 0.17531749606132507, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 222.6875, "completions/mean_terminated_length": 155.92308044433594, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3888, "grad_norm": 9.174544334411621, "kl": 0.21746826171875, "learning_rate": 1e-06, "loss": 0.0552, "num_tokens": 6399885.0, "reward": 0.20374764502048492, "reward_std": 0.02469576895236969, "rewards/bleu_reward_func/mean": 0.20374764502048492, "rewards/bleu_reward_func/std": 0.17774522304534912, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 212.40625, "completions/mean_terminated_length": 112.54167175292969, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.3896, "grad_norm": 8.529189109802246, "kl": 0.412689208984375, "learning_rate": 1e-06, "loss": -0.0442, "num_tokens": 6410746.0, "reward": 0.13253280520439148, "reward_std": 0.03401318937540054, "rewards/bleu_reward_func/mean": 0.13253280520439148, "rewards/bleu_reward_func/std": 0.09572894126176834, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 343.40625, "completions/mean_terminated_length": 277.4347839355469, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.3904, "grad_norm": 5.815334796905518, "kl": 0.06719970703125, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 6424639.0, "reward": 0.14998552203178406, "reward_std": 0.03536435216665268, "rewards/bleu_reward_func/mean": 0.14998552203178406, "rewards/bleu_reward_func/std": 0.08015048503875732, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 242.21875, "completions/mean_terminated_length": 192.25926208496094, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.3912, "grad_norm": 8.644153594970703, "kl": 0.204193115234375, "learning_rate": 1e-06, "loss": 0.2233, "num_tokens": 6437382.0, "reward": 0.08585190027952194, "reward_std": 0.032436732202768326, "rewards/bleu_reward_func/mean": 0.08585190027952194, "rewards/bleu_reward_func/std": 0.10239724069833755, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 108.20000457763672, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.392, "grad_norm": 18.119718551635742, "kl": 0.39862060546875, "learning_rate": 1e-06, "loss": 0.0372, "num_tokens": 6450634.0, "reward": 0.07857100665569305, "reward_std": 0.010440990328788757, "rewards/bleu_reward_func/mean": 0.07857100665569305, "rewards/bleu_reward_func/std": 0.06719467043876648, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 348.28125, "completions/mean_terminated_length": 262.5238037109375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.3928, "grad_norm": 2.811199903488159, "kl": 0.03350830078125, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 6464187.0, "reward": 0.07400047779083252, "reward_std": 0.021461695432662964, "rewards/bleu_reward_func/mean": 0.07400047779083252, "rewards/bleu_reward_func/std": 0.061210907995700836, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 225.53125, "completions/mean_terminated_length": 172.48147583007812, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.3936, "grad_norm": 8.102995872497559, "kl": 0.17437744140625, "learning_rate": 1e-06, "loss": -0.0621, "num_tokens": 6477388.0, "reward": 0.08205416798591614, "reward_std": 0.02140321210026741, "rewards/bleu_reward_func/mean": 0.08205416798591614, "rewards/bleu_reward_func/std": 0.06504324823617935, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 248.40625, "completions/mean_terminated_length": 239.90321350097656, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.3944, "grad_norm": 5.510415554046631, "kl": 0.054595947265625, "learning_rate": 1e-06, "loss": 0.1424, "num_tokens": 6490681.0, "reward": 0.09917749464511871, "reward_std": 0.03953540325164795, "rewards/bleu_reward_func/mean": 0.09917749464511871, "rewards/bleu_reward_func/std": 0.062214821577072144, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 347.65625, "completions/mean_terminated_length": 202.64706420898438, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.3952, "grad_norm": 3.754049301147461, "kl": 0.065643310546875, "learning_rate": 1e-06, "loss": -0.0312, "num_tokens": 6508054.0, "reward": 0.04995376244187355, "reward_std": 0.018671657890081406, "rewards/bleu_reward_func/mean": 0.04995376244187355, "rewards/bleu_reward_func/std": 0.021997425705194473, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 304.78125, "completions/mean_terminated_length": 121.94117736816406, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.396, "grad_norm": 3.28092098236084, "kl": 0.0880889892578125, "learning_rate": 1e-06, "loss": 0.2271, "num_tokens": 6528167.0, "reward": 0.21464568376541138, "reward_std": 0.04326138645410538, "rewards/bleu_reward_func/mean": 0.21464568376541138, "rewards/bleu_reward_func/std": 0.2538887560367584, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 260.0, "completions/mean_terminated_length": 161.3913116455078, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3968, "grad_norm": 7.667696475982666, "kl": 0.27783203125, "learning_rate": 1e-06, "loss": -0.0064, "num_tokens": 6539503.0, "reward": 0.14023897051811218, "reward_std": 0.03843347355723381, "rewards/bleu_reward_func/mean": 0.14023897051811218, "rewards/bleu_reward_func/std": 0.12260077148675919, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 312.78125, "completions/mean_terminated_length": 275.8888854980469, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.3976, "grad_norm": 5.650123596191406, "kl": 0.122955322265625, "learning_rate": 1e-06, "loss": -0.218, "num_tokens": 6556880.0, "reward": 0.2068222463130951, "reward_std": 0.08186712116003036, "rewards/bleu_reward_func/mean": 0.2068222463130951, "rewards/bleu_reward_func/std": 0.30478134751319885, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 115.5, "completions/mean_terminated_length": 115.5, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.3984, "grad_norm": 9.522087097167969, "kl": 0.31427001953125, "learning_rate": 1e-06, "loss": 0.3453, "num_tokens": 6564648.0, "reward": 0.21922443807125092, "reward_std": 0.07997345924377441, "rewards/bleu_reward_func/mean": 0.21922443807125092, "rewards/bleu_reward_func/std": 0.12106078118085861, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 283.46875, "completions/mean_terminated_length": 194.04348754882812, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.3992, "grad_norm": 4.472853183746338, "kl": 0.10198974609375, "learning_rate": 1e-06, "loss": 0.0606, "num_tokens": 6577575.0, "reward": 0.1807648241519928, "reward_std": 0.04940491169691086, "rewards/bleu_reward_func/mean": 0.1807648241519928, "rewards/bleu_reward_func/std": 0.2276194989681244, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 257.6875, "completions/mean_terminated_length": 210.59259033203125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.4, "grad_norm": 3.429314136505127, "kl": 0.120086669921875, "learning_rate": 1e-06, "loss": 0.1092, "num_tokens": 6590341.0, "reward": 0.13892096281051636, "reward_std": 0.04246610775589943, "rewards/bleu_reward_func/mean": 0.13892096281051636, "rewards/bleu_reward_func/std": 0.12665794789791107, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 150.78125, "completions/mean_terminated_length": 126.70000457763672, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4008, "grad_norm": 6.932479381561279, "kl": 0.4072265625, "learning_rate": 1e-06, "loss": 0.1306, "num_tokens": 6604182.0, "reward": 0.13375571370124817, "reward_std": 0.05735353007912636, "rewards/bleu_reward_func/mean": 0.13375571370124817, "rewards/bleu_reward_func/std": 0.14047691226005554, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 329.375, "completions/mean_terminated_length": 257.9130554199219, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.4016, "grad_norm": 3.9977669715881348, "kl": 0.0543212890625, "learning_rate": 1e-06, "loss": 0.1227, "num_tokens": 6619994.0, "reward": 0.08314976096153259, "reward_std": 0.01850474253296852, "rewards/bleu_reward_func/mean": 0.08314976096153259, "rewards/bleu_reward_func/std": 0.03126469627022743, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 328.875, "completions/mean_terminated_length": 245.63636779785156, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4024, "grad_norm": 8.637741088867188, "kl": 0.294342041015625, "learning_rate": 1e-06, "loss": -0.0294, "num_tokens": 6632862.0, "reward": 0.21461226046085358, "reward_std": 0.05726875364780426, "rewards/bleu_reward_func/mean": 0.21461226046085358, "rewards/bleu_reward_func/std": 0.19377335906028748, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 345.71875, "completions/mean_terminated_length": 245.9499969482422, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4032, "grad_norm": 5.415818691253662, "kl": 0.11663818359375, "learning_rate": 1e-06, "loss": -0.1008, "num_tokens": 6649573.0, "reward": 0.10018286108970642, "reward_std": 0.025530360639095306, "rewards/bleu_reward_func/mean": 0.10018286108970642, "rewards/bleu_reward_func/std": 0.08217810094356537, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 293.21875, "completions/mean_terminated_length": 123.05555725097656, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.404, "grad_norm": 5.128636837005615, "kl": 0.0567626953125, "learning_rate": 1e-06, "loss": 0.1747, "num_tokens": 6661556.0, "reward": 0.08193753659725189, "reward_std": 0.036860473453998566, "rewards/bleu_reward_func/mean": 0.08193753659725189, "rewards/bleu_reward_func/std": 0.0639234408736229, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 383.1875, "completions/mean_terminated_length": 254.375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.4048, "grad_norm": 2.5557072162628174, "kl": 0.03839111328125, "learning_rate": 1e-06, "loss": -0.0743, "num_tokens": 6678938.0, "reward": 0.05591622740030289, "reward_std": 0.017734069377183914, "rewards/bleu_reward_func/mean": 0.05591622740030289, "rewards/bleu_reward_func/std": 0.04607876017689705, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 293.625, "completions/mean_terminated_length": 75.25, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4056, "grad_norm": 5.220037460327148, "kl": 0.07464599609375, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 6691822.0, "reward": 0.029562367126345634, "reward_std": 0.03146641328930855, "rewards/bleu_reward_func/mean": 0.029562367126345634, "rewards/bleu_reward_func/std": 0.04593721404671669, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 453.96875, "completions/mean_terminated_length": 343.18182373046875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4064, "grad_norm": 2.189230442047119, "kl": 0.026153564453125, "learning_rate": 1e-06, "loss": -0.0992, "num_tokens": 6709997.0, "reward": 0.03725602477788925, "reward_std": 0.02092660963535309, "rewards/bleu_reward_func/mean": 0.03725602477788925, "rewards/bleu_reward_func/std": 0.02429044619202614, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 106.34375, "completions/mean_terminated_length": 79.30000305175781, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4072, "grad_norm": 6.567111015319824, "kl": 0.233154296875, "learning_rate": 1e-06, "loss": 0.2905, "num_tokens": 6718912.0, "reward": 0.15163123607635498, "reward_std": 0.039707012474536896, "rewards/bleu_reward_func/mean": 0.15163123607635498, "rewards/bleu_reward_func/std": 0.12998701632022858, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 119.03125, "completions/mean_terminated_length": 106.3548355102539, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.408, "grad_norm": 8.991604804992676, "kl": 0.14703369140625, "learning_rate": 1e-06, "loss": 0.0972, "num_tokens": 6728609.0, "reward": 0.23723718523979187, "reward_std": 0.07665139436721802, "rewards/bleu_reward_func/mean": 0.23723718523979187, "rewards/bleu_reward_func/std": 0.27060666680336, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 171.23809814453125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.4088, "grad_norm": 6.349617958068848, "kl": 0.168121337890625, "learning_rate": 1e-06, "loss": 0.0688, "num_tokens": 6743789.0, "reward": 0.1937231868505478, "reward_std": 0.13082939386367798, "rewards/bleu_reward_func/mean": 0.1937231868505478, "rewards/bleu_reward_func/std": 0.25435397028923035, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 401.96875, "completions/mean_terminated_length": 291.9375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.4096, "grad_norm": 2.5390427112579346, "kl": 0.03851318359375, "learning_rate": 1e-06, "loss": 0.1042, "num_tokens": 6759732.0, "reward": 0.029224077239632607, "reward_std": 0.016936711966991425, "rewards/bleu_reward_func/mean": 0.029224077239632607, "rewards/bleu_reward_func/std": 0.022709792479872704, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 268.46875, "completions/mean_terminated_length": 79.05555725097656, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4104, "grad_norm": 3.7983713150024414, "kl": 0.137786865234375, "learning_rate": 1e-06, "loss": -0.0173, "num_tokens": 6774051.0, "reward": 0.20052862167358398, "reward_std": 0.028155002743005753, "rewards/bleu_reward_func/mean": 0.20052862167358398, "rewards/bleu_reward_func/std": 0.2302575409412384, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 388.71875, "completions/mean_terminated_length": 360.2692565917969, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.4112, "grad_norm": 2.4446346759796143, "kl": 0.028076171875, "learning_rate": 1e-06, "loss": -0.0783, "num_tokens": 6790610.0, "reward": 0.10578086227178574, "reward_std": 0.029093941673636436, "rewards/bleu_reward_func/mean": 0.10578086227178574, "rewards/bleu_reward_func/std": 0.08641202747821808, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 275.34375, "completions/mean_terminated_length": 259.5666809082031, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.412, "grad_norm": 5.883263111114502, "kl": 0.18634033203125, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 6803965.0, "reward": 0.1322258561849594, "reward_std": 0.030806170776486397, "rewards/bleu_reward_func/mean": 0.1322258561849594, "rewards/bleu_reward_func/std": 0.16078709065914154, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 333.84375, "completions/mean_terminated_length": 274.4583435058594, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.4128, "grad_norm": 3.016139507293701, "kl": 0.03173828125, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 6818840.0, "reward": 0.09323176741600037, "reward_std": 0.05342460051178932, "rewards/bleu_reward_func/mean": 0.09323176741600037, "rewards/bleu_reward_func/std": 0.06577997654676437, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 276.5625, "completions/mean_terminated_length": 210.63999938964844, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.4136, "grad_norm": 4.685121059417725, "kl": 0.050933837890625, "learning_rate": 1e-06, "loss": -0.1784, "num_tokens": 6830770.0, "reward": 0.03872024267911911, "reward_std": 0.016178004443645477, "rewards/bleu_reward_func/mean": 0.03872024267911911, "rewards/bleu_reward_func/std": 0.025313377380371094, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 362.875, "completions/mean_terminated_length": 231.2941131591797, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4144, "grad_norm": 4.639893054962158, "kl": 0.122711181640625, "learning_rate": 1e-06, "loss": 0.0545, "num_tokens": 6848302.0, "reward": 0.07996964454650879, "reward_std": 0.01709877885878086, "rewards/bleu_reward_func/mean": 0.07996964454650879, "rewards/bleu_reward_func/std": 0.10056579113006592, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 263.5714416503906, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.4152, "grad_norm": 9.085565567016602, "kl": 0.1670379638671875, "learning_rate": 1e-06, "loss": -0.1226, "num_tokens": 6859826.0, "reward": 0.10505213588476181, "reward_std": 0.05224030464887619, "rewards/bleu_reward_func/mean": 0.10505213588476181, "rewards/bleu_reward_func/std": 0.0725407749414444, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 258.3125, "completions/mean_terminated_length": 173.75, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.416, "grad_norm": 7.134840965270996, "kl": 0.175750732421875, "learning_rate": 1e-06, "loss": 0.1168, "num_tokens": 6873516.0, "reward": 0.21853026747703552, "reward_std": 0.06429094821214676, "rewards/bleu_reward_func/mean": 0.21853026747703552, "rewards/bleu_reward_func/std": 0.14174966514110565, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 328.65625, "completions/mean_terminated_length": 256.9130554199219, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4168, "grad_norm": 6.0497517585754395, "kl": 0.131439208984375, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 6887761.0, "reward": 0.0685054138302803, "reward_std": 0.012891553342342377, "rewards/bleu_reward_func/mean": 0.0685054138302803, "rewards/bleu_reward_func/std": 0.057060711085796356, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 267.0625, "completions/mean_terminated_length": 221.70370483398438, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.4176, "grad_norm": 4.475714683532715, "kl": 0.047637939453125, "learning_rate": 1e-06, "loss": 0.1563, "num_tokens": 6901251.0, "reward": 0.18483126163482666, "reward_std": 0.02913127839565277, "rewards/bleu_reward_func/mean": 0.18483126163482666, "rewards/bleu_reward_func/std": 0.16543246805667877, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 228.53125, "completions/mean_terminated_length": 163.11538696289062, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.4184, "grad_norm": 7.8729143142700195, "kl": 0.297882080078125, "learning_rate": 1e-06, "loss": 0.0428, "num_tokens": 6912844.0, "reward": 0.1846814900636673, "reward_std": 0.10159599035978317, "rewards/bleu_reward_func/mean": 0.1846814900636673, "rewards/bleu_reward_func/std": 0.2030598670244217, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 251.09375, "completions/mean_terminated_length": 233.70001220703125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.4192, "grad_norm": 4.603794097900391, "kl": 0.08197021484375, "learning_rate": 1e-06, "loss": -0.0057, "num_tokens": 6923895.0, "reward": 0.11323156207799911, "reward_std": 0.03932211175560951, "rewards/bleu_reward_func/mean": 0.11323156207799911, "rewards/bleu_reward_func/std": 0.08274988830089569, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 76.625, "completions/mean_terminated_length": 76.625, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.42, "grad_norm": 8.708818435668945, "kl": 0.1953125, "learning_rate": 1e-06, "loss": 0.3306, "num_tokens": 6934091.0, "reward": 0.18468719720840454, "reward_std": 0.0689420998096466, "rewards/bleu_reward_func/mean": 0.18468719720840454, "rewards/bleu_reward_func/std": 0.12529541552066803, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 220.00001525878906, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.4208, "grad_norm": 15.188727378845215, "kl": 0.10748291015625, "learning_rate": 1e-06, "loss": -0.0496, "num_tokens": 6946923.0, "reward": 0.09780866652727127, "reward_std": 0.029562484472990036, "rewards/bleu_reward_func/mean": 0.09780866652727127, "rewards/bleu_reward_func/std": 0.09735672175884247, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 107.80952453613281, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4216, "grad_norm": 5.919389247894287, "kl": 0.1287841796875, "learning_rate": 1e-06, "loss": 0.0944, "num_tokens": 6958675.0, "reward": 0.049182113260030746, "reward_std": 0.03928225487470627, "rewards/bleu_reward_func/mean": 0.049182113260030746, "rewards/bleu_reward_func/std": 0.05703483149409294, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 298.59375, "completions/mean_terminated_length": 268.1071472167969, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.4224, "grad_norm": 4.162198066711426, "kl": 0.048370361328125, "learning_rate": 1e-06, "loss": -0.0255, "num_tokens": 6973038.0, "reward": 0.19552364945411682, "reward_std": 0.05411393195390701, "rewards/bleu_reward_func/mean": 0.19552364945411682, "rewards/bleu_reward_func/std": 0.11564817279577255, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 343.125, "completions/mean_terminated_length": 266.3636474609375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4232, "grad_norm": 7.422494411468506, "kl": 0.06976318359375, "learning_rate": 1e-06, "loss": 0.0526, "num_tokens": 6988034.0, "reward": 0.03407738357782364, "reward_std": 0.010626979172229767, "rewards/bleu_reward_func/mean": 0.03407738357782364, "rewards/bleu_reward_func/std": 0.027887288480997086, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 53.53125, "completions/mean_terminated_length": 53.53125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.424, "grad_norm": 13.498769760131836, "kl": 0.46209716796875, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 6995419.0, "reward": 0.24595381319522858, "reward_std": 0.09870806336402893, "rewards/bleu_reward_func/mean": 0.24595381319522858, "rewards/bleu_reward_func/std": 0.1663571149110794, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 392.5, "completions/mean_terminated_length": 287.058837890625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4248, "grad_norm": 2.2019829750061035, "kl": 0.0286407470703125, "learning_rate": 1e-06, "loss": -0.2369, "num_tokens": 7014707.0, "reward": 0.12730640172958374, "reward_std": 0.03398028016090393, "rewards/bleu_reward_func/mean": 0.12730640172958374, "rewards/bleu_reward_func/std": 0.20578297972679138, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 317.15625, "completions/mean_terminated_length": 262.6000061035156, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.4256, "grad_norm": 5.298976421356201, "kl": 0.087127685546875, "learning_rate": 1e-06, "loss": -0.0481, "num_tokens": 7030000.0, "reward": 0.06116287037730217, "reward_std": 0.04584234952926636, "rewards/bleu_reward_func/mean": 0.06116287037730217, "rewards/bleu_reward_func/std": 0.07913482189178467, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 321.34375, "completions/mean_terminated_length": 234.68182373046875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.4264, "grad_norm": 4.799532413482666, "kl": 0.1138916015625, "learning_rate": 1e-06, "loss": -0.0807, "num_tokens": 7046643.0, "reward": 0.08829933404922485, "reward_std": 0.03609791770577431, "rewards/bleu_reward_func/mean": 0.08829933404922485, "rewards/bleu_reward_func/std": 0.10983619093894958, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 196.84375, "completions/mean_terminated_length": 151.82144165039062, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4272, "grad_norm": 15.142457008361816, "kl": 0.25970458984375, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 7057718.0, "reward": 0.10968612134456635, "reward_std": 0.05676144361495972, "rewards/bleu_reward_func/mean": 0.10968612134456635, "rewards/bleu_reward_func/std": 0.1397821009159088, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 244.9375, "completions/mean_terminated_length": 195.48147583007812, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.428, "grad_norm": 6.9789934158325195, "kl": 0.092926025390625, "learning_rate": 1e-06, "loss": 0.1057, "num_tokens": 7073980.0, "reward": 0.19463014602661133, "reward_std": 0.09179598838090897, "rewards/bleu_reward_func/mean": 0.19463014602661133, "rewards/bleu_reward_func/std": 0.1903815120458603, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 122.13333892822266, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.4288, "grad_norm": 10.111763000488281, "kl": 0.143310546875, "learning_rate": 1e-06, "loss": -0.0902, "num_tokens": 7085228.0, "reward": 0.15931251645088196, "reward_std": 0.06651220470666885, "rewards/bleu_reward_func/mean": 0.15931251645088196, "rewards/bleu_reward_func/std": 0.10370245575904846, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 245.09375, "completions/mean_terminated_length": 183.5, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.4296, "grad_norm": 11.313093185424805, "kl": 0.135894775390625, "learning_rate": 1e-06, "loss": 0.2524, "num_tokens": 7098583.0, "reward": 0.08501166105270386, "reward_std": 0.03819301724433899, "rewards/bleu_reward_func/mean": 0.08501166105270386, "rewards/bleu_reward_func/std": 0.0931810513138771, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 244.80001831054688, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.4304, "grad_norm": 4.153919696807861, "kl": 0.050628662109375, "learning_rate": 1e-06, "loss": -0.1541, "num_tokens": 7108663.0, "reward": 0.06835095584392548, "reward_std": 0.042577650398015976, "rewards/bleu_reward_func/mean": 0.06835095584392548, "rewards/bleu_reward_func/std": 0.05704295262694359, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 173.48147583007812, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4312, "grad_norm": 13.284401893615723, "kl": 0.14434814453125, "learning_rate": 1e-06, "loss": -0.0842, "num_tokens": 7118427.0, "reward": 0.08002069592475891, "reward_std": 0.029213791713118553, "rewards/bleu_reward_func/mean": 0.08002069592475891, "rewards/bleu_reward_func/std": 0.03687189891934395, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 123.65625, "completions/mean_terminated_length": 83.48275756835938, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.432, "grad_norm": 23.51561164855957, "kl": 0.1839599609375, "learning_rate": 1e-06, "loss": 0.4993, "num_tokens": 7128856.0, "reward": 0.2179010808467865, "reward_std": 0.08272600173950195, "rewards/bleu_reward_func/mean": 0.2179010808467865, "rewards/bleu_reward_func/std": 0.26301127672195435, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 332.40625, "completions/mean_terminated_length": 173.94117736816406, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.4328, "grad_norm": 10.934617042541504, "kl": 0.10113525390625, "learning_rate": 1e-06, "loss": -0.1254, "num_tokens": 7141661.0, "reward": 0.06413869559764862, "reward_std": 0.05120678246021271, "rewards/bleu_reward_func/mean": 0.06413869559764862, "rewards/bleu_reward_func/std": 0.09179537743330002, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 221.83334350585938, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4336, "grad_norm": 3.505484104156494, "kl": 0.06707763671875, "learning_rate": 1e-06, "loss": 0.0835, "num_tokens": 7153161.0, "reward": 0.09516981989145279, "reward_std": 0.044140610843896866, "rewards/bleu_reward_func/mean": 0.09516981989145279, "rewards/bleu_reward_func/std": 0.049775656312704086, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 214.8000030517578, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.4344, "grad_norm": 7.354869842529297, "kl": 0.10260009765625, "learning_rate": 1e-06, "loss": -0.1239, "num_tokens": 7167649.0, "reward": 0.03533574938774109, "reward_std": 0.014214935712516308, "rewards/bleu_reward_func/mean": 0.03533574938774109, "rewards/bleu_reward_func/std": 0.027195338159799576, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 225.78125, "completions/mean_terminated_length": 159.73077392578125, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.4352, "grad_norm": 5.206735610961914, "kl": 0.09149169921875, "learning_rate": 1e-06, "loss": -0.0428, "num_tokens": 7181226.0, "reward": 0.22954684495925903, "reward_std": 0.06006891652941704, "rewards/bleu_reward_func/mean": 0.22954684495925903, "rewards/bleu_reward_func/std": 0.11863149702548981, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 250.75, "completions/mean_terminated_length": 190.4615478515625, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.436, "grad_norm": 5.510367393493652, "kl": 0.05084228515625, "learning_rate": 1e-06, "loss": 0.1971, "num_tokens": 7191810.0, "reward": 0.08453569561243057, "reward_std": 0.050511520355939865, "rewards/bleu_reward_func/mean": 0.08453569561243057, "rewards/bleu_reward_func/std": 0.07364515960216522, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 388.34375, "completions/mean_terminated_length": 314.1499938964844, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.4368, "grad_norm": 3.61635160446167, "kl": 0.045135498046875, "learning_rate": 1e-06, "loss": 0.0817, "num_tokens": 7206789.0, "reward": 0.050152119249105453, "reward_std": 0.03165213763713837, "rewards/bleu_reward_func/mean": 0.050152119249105453, "rewards/bleu_reward_func/std": 0.05620579421520233, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.4376, "grad_norm": 8.828208923339844, "kl": 0.25537109375, "learning_rate": 1e-06, "loss": 0.103, "num_tokens": 7213949.0, "reward": 0.20786888897418976, "reward_std": 0.06727642565965652, "rewards/bleu_reward_func/mean": 0.20786888897418976, "rewards/bleu_reward_func/std": 0.1706974357366562, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 196.1666717529297, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.4384, "grad_norm": 13.268147468566895, "kl": 0.058074951171875, "learning_rate": 1e-06, "loss": -0.0694, "num_tokens": 7227041.0, "reward": 0.05118046700954437, "reward_std": 0.02497515268623829, "rewards/bleu_reward_func/mean": 0.05118046700954437, "rewards/bleu_reward_func/std": 0.035916514694690704, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 351.3125, "completions/mean_terminated_length": 278.2727355957031, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.4392, "grad_norm": 13.135753631591797, "kl": 0.077850341796875, "learning_rate": 1e-06, "loss": 0.0555, "num_tokens": 7244763.0, "reward": 0.07840518653392792, "reward_std": 0.022635504603385925, "rewards/bleu_reward_func/mean": 0.07840518653392792, "rewards/bleu_reward_func/std": 0.06580173969268799, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 201.1875, "completions/mean_terminated_length": 156.7857208251953, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.44, "grad_norm": 7.055432319641113, "kl": 0.25830078125, "learning_rate": 1e-06, "loss": -0.0546, "num_tokens": 7256785.0, "reward": 0.253431499004364, "reward_std": 0.028121720999479294, "rewards/bleu_reward_func/mean": 0.253431499004364, "rewards/bleu_reward_func/std": 0.20365522801876068, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 336.8125, "completions/mean_terminated_length": 287.7599792480469, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4408, "grad_norm": 3.6197187900543213, "kl": 0.046844482421875, "learning_rate": 1e-06, "loss": 0.1119, "num_tokens": 7268987.0, "reward": 0.061213478446006775, "reward_std": 0.01489005982875824, "rewards/bleu_reward_func/mean": 0.061213478446006775, "rewards/bleu_reward_func/std": 0.038935884833335876, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 386.21875, "completions/mean_terminated_length": 351.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.4416, "grad_norm": 2.6066031455993652, "kl": 0.034149169921875, "learning_rate": 1e-06, "loss": -0.1341, "num_tokens": 7287306.0, "reward": 0.11066319048404694, "reward_std": 0.105903759598732, "rewards/bleu_reward_func/mean": 0.11066319048404694, "rewards/bleu_reward_func/std": 0.16723419725894928, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 263.28125, "completions/mean_terminated_length": 237.55172729492188, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.4424, "grad_norm": 9.281195640563965, "kl": 0.226043701171875, "learning_rate": 1e-06, "loss": -0.1571, "num_tokens": 7299899.0, "reward": 0.06672249734401703, "reward_std": 0.03525693714618683, "rewards/bleu_reward_func/mean": 0.06672249734401703, "rewards/bleu_reward_func/std": 0.0810592845082283, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 233.84375, "completions/mean_terminated_length": 182.3333282470703, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4432, "grad_norm": 7.543862342834473, "kl": 0.308807373046875, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 7313638.0, "reward": 0.30825120210647583, "reward_std": 0.07663644850254059, "rewards/bleu_reward_func/mean": 0.30825120210647583, "rewards/bleu_reward_func/std": 0.1689450740814209, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 246.40625, "completions/mean_terminated_length": 218.9310302734375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.444, "grad_norm": 4.433272361755371, "kl": 0.104339599609375, "learning_rate": 1e-06, "loss": 0.0582, "num_tokens": 7324939.0, "reward": 0.19763408601284027, "reward_std": 0.028635632246732712, "rewards/bleu_reward_func/mean": 0.19763408601284027, "rewards/bleu_reward_func/std": 0.18309614062309265, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 62.96875, "completions/mean_terminated_length": 48.48386764526367, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4448, "grad_norm": 10.322346687316895, "kl": 0.1849365234375, "learning_rate": 1e-06, "loss": 0.432, "num_tokens": 7330802.0, "reward": 0.19210518896579742, "reward_std": 0.03121430240571499, "rewards/bleu_reward_func/mean": 0.19210518896579742, "rewards/bleu_reward_func/std": 0.16853223741054535, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 202.78125, "completions/mean_terminated_length": 182.1666717529297, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.4456, "grad_norm": 9.519190788269043, "kl": 0.171844482421875, "learning_rate": 1e-06, "loss": 0.0451, "num_tokens": 7339683.0, "reward": 0.170665442943573, "reward_std": 0.06568457931280136, "rewards/bleu_reward_func/mean": 0.170665442943573, "rewards/bleu_reward_func/std": 0.1584860235452652, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 123.04762268066406, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4464, "grad_norm": 5.70733118057251, "kl": 0.177093505859375, "learning_rate": 1e-06, "loss": -0.0177, "num_tokens": 7354059.0, "reward": 0.11887075752019882, "reward_std": 0.037268251180648804, "rewards/bleu_reward_func/mean": 0.11887075752019882, "rewards/bleu_reward_func/std": 0.09704269468784332, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 237.28125, "completions/mean_terminated_length": 145.70834350585938, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4472, "grad_norm": 10.510950088500977, "kl": 0.141387939453125, "learning_rate": 1e-06, "loss": 0.2298, "num_tokens": 7369252.0, "reward": 0.11686157435178757, "reward_std": 0.06300412118434906, "rewards/bleu_reward_func/mean": 0.11686157435178757, "rewards/bleu_reward_func/std": 0.10008818656206131, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 294.21875, "completions/mean_terminated_length": 209.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.448, "grad_norm": 12.427102088928223, "kl": 0.15325927734375, "learning_rate": 1e-06, "loss": -0.1463, "num_tokens": 7384539.0, "reward": 0.10454531759023666, "reward_std": 0.032633934170007706, "rewards/bleu_reward_func/mean": 0.10454531759023666, "rewards/bleu_reward_func/std": 0.09093461185693741, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 175.84375, "completions/mean_terminated_length": 63.79166793823242, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4488, "grad_norm": 18.402080535888672, "kl": 0.187530517578125, "learning_rate": 1e-06, "loss": 0.6927, "num_tokens": 7394846.0, "reward": 0.21487680077552795, "reward_std": 0.08058933913707733, "rewards/bleu_reward_func/mean": 0.21487680077552795, "rewards/bleu_reward_func/std": 0.20088493824005127, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 425.21875, "completions/mean_terminated_length": 379.76190185546875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.4496, "grad_norm": 2.861811637878418, "kl": 0.030670166015625, "learning_rate": 1e-06, "loss": -0.0581, "num_tokens": 7414069.0, "reward": 0.09261719137430191, "reward_std": 0.046390384435653687, "rewards/bleu_reward_func/mean": 0.09261719137430191, "rewards/bleu_reward_func/std": 0.14345434308052063, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 372.53125, "completions/mean_terminated_length": 277.1052551269531, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.4504, "grad_norm": 3.737154960632324, "kl": 0.07086181640625, "learning_rate": 1e-06, "loss": -0.0085, "num_tokens": 7433038.0, "reward": 0.13954411447048187, "reward_std": 0.09964635223150253, "rewards/bleu_reward_func/mean": 0.13954411447048187, "rewards/bleu_reward_func/std": 0.2269161492586136, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 175.42857360839844, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.4512, "grad_norm": 9.693827629089355, "kl": 0.2703857421875, "learning_rate": 1e-06, "loss": 0.0569, "num_tokens": 7442590.0, "reward": 0.08689892292022705, "reward_std": 0.046516068279743195, "rewards/bleu_reward_func/mean": 0.08689892292022705, "rewards/bleu_reward_func/std": 0.09460947662591934, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 344.3125, "completions/mean_terminated_length": 320.3571472167969, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.452, "grad_norm": 3.287851333618164, "kl": 0.03594970703125, "learning_rate": 1e-06, "loss": -0.0888, "num_tokens": 7455856.0, "reward": 0.09177221357822418, "reward_std": 0.02658715285360813, "rewards/bleu_reward_func/mean": 0.09177221357822418, "rewards/bleu_reward_func/std": 0.04939228668808937, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 141.09091186523438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4528, "grad_norm": 5.329028129577637, "kl": 0.21087646484375, "learning_rate": 1e-06, "loss": -0.0318, "num_tokens": 7471128.0, "reward": 0.30248120427131653, "reward_std": 0.045193642377853394, "rewards/bleu_reward_func/mean": 0.30248120427131653, "rewards/bleu_reward_func/std": 0.09429154545068741, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 396.5625, "completions/mean_terminated_length": 306.77777099609375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4536, "grad_norm": 2.8240556716918945, "kl": 0.029327392578125, "learning_rate": 1e-06, "loss": -0.006, "num_tokens": 7486778.0, "reward": 0.046158432960510254, "reward_std": 0.012592589482665062, "rewards/bleu_reward_func/mean": 0.046158432960510254, "rewards/bleu_reward_func/std": 0.0691753551363945, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 180.00001525878906, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.4544, "grad_norm": 5.554408073425293, "kl": 0.077850341796875, "learning_rate": 1e-06, "loss": -0.0114, "num_tokens": 7501522.0, "reward": 0.19211658835411072, "reward_std": 0.052228912711143494, "rewards/bleu_reward_func/mean": 0.19211658835411072, "rewards/bleu_reward_func/std": 0.12220965325832367, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 282.90625, "completions/mean_terminated_length": 104.72222137451172, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.4552, "grad_norm": 8.932016372680664, "kl": 0.1943359375, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 7515511.0, "reward": 0.08466814458370209, "reward_std": 0.03040888160467148, "rewards/bleu_reward_func/mean": 0.08466814458370209, "rewards/bleu_reward_func/std": 0.07005324959754944, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 450.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.456, "grad_norm": 1.950373888015747, "kl": 0.0325927734375, "learning_rate": 1e-06, "loss": 0.0603, "num_tokens": 7535791.0, "reward": 0.06426975131034851, "reward_std": 0.02304723486304283, "rewards/bleu_reward_func/mean": 0.06426975131034851, "rewards/bleu_reward_func/std": 0.04708797112107277, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 200.03125, "completions/mean_terminated_length": 167.7586212158203, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.4568, "grad_norm": 8.692915916442871, "kl": 0.300445556640625, "learning_rate": 1e-06, "loss": 0.179, "num_tokens": 7548720.0, "reward": 0.16858291625976562, "reward_std": 0.04772442951798439, "rewards/bleu_reward_func/mean": 0.16858291625976562, "rewards/bleu_reward_func/std": 0.187880739569664, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 221.4375, "completions/mean_terminated_length": 154.38462829589844, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4576, "grad_norm": 5.559481143951416, "kl": 0.2091064453125, "learning_rate": 1e-06, "loss": 0.1461, "num_tokens": 7559926.0, "reward": 0.2749570608139038, "reward_std": 0.07935648411512375, "rewards/bleu_reward_func/mean": 0.2749570608139038, "rewards/bleu_reward_func/std": 0.20695801079273224, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 227.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.4584, "grad_norm": 4.713781833648682, "kl": 0.10894775390625, "learning_rate": 1e-06, "loss": -0.1979, "num_tokens": 7573214.0, "reward": 0.11424913257360458, "reward_std": 0.0238350722938776, "rewards/bleu_reward_func/mean": 0.11424913257360458, "rewards/bleu_reward_func/std": 0.1513095498085022, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 231.1666717529297, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.4592, "grad_norm": 6.019801616668701, "kl": 0.117706298828125, "learning_rate": 1e-06, "loss": -0.0467, "num_tokens": 7584922.0, "reward": 0.12773753702640533, "reward_std": 0.03902646526694298, "rewards/bleu_reward_func/mean": 0.12773753702640533, "rewards/bleu_reward_func/std": 0.08676618337631226, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 261.3125, "completions/mean_terminated_length": 40.11764907836914, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.46, "grad_norm": 20.097490310668945, "kl": 0.32666015625, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 7599564.0, "reward": 0.1775631606578827, "reward_std": 0.05471285060048103, "rewards/bleu_reward_func/mean": 0.1775631606578827, "rewards/bleu_reward_func/std": 0.1462731659412384, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 224.59375, "completions/mean_terminated_length": 93.95455169677734, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4608, "grad_norm": 12.845754623413086, "kl": 0.2415771484375, "learning_rate": 1e-06, "loss": 0.1681, "num_tokens": 7609343.0, "reward": 0.10711174458265305, "reward_std": 0.03790780156850815, "rewards/bleu_reward_func/mean": 0.10711174458265305, "rewards/bleu_reward_func/std": 0.11842114478349686, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 429.15625, "completions/mean_terminated_length": 217.44444274902344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.4616, "grad_norm": 3.16619873046875, "kl": 0.031341552734375, "learning_rate": 1e-06, "loss": -0.1864, "num_tokens": 7628708.0, "reward": 0.12277669459581375, "reward_std": 0.030532412230968475, "rewards/bleu_reward_func/mean": 0.12277669459581375, "rewards/bleu_reward_func/std": 0.14895910024642944, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 100.63999938964844, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.4624, "grad_norm": 6.808165550231934, "kl": 0.2611083984375, "learning_rate": 1e-06, "loss": -0.043, "num_tokens": 7637504.0, "reward": 0.06198694184422493, "reward_std": 0.018319500610232353, "rewards/bleu_reward_func/mean": 0.06198694184422493, "rewards/bleu_reward_func/std": 0.05399094894528389, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 306.65625, "completions/mean_terminated_length": 125.47058868408203, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4632, "grad_norm": 6.689248085021973, "kl": 0.165435791015625, "learning_rate": 1e-06, "loss": 0.0301, "num_tokens": 7651797.0, "reward": 0.07045552134513855, "reward_std": 0.018690217286348343, "rewards/bleu_reward_func/mean": 0.07045552134513855, "rewards/bleu_reward_func/std": 0.07137548923492432, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 332.6875, "completions/mean_terminated_length": 193.22222900390625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.464, "grad_norm": 4.129982948303223, "kl": 0.050750732421875, "learning_rate": 1e-06, "loss": 0.2464, "num_tokens": 7667787.0, "reward": 0.09383320808410645, "reward_std": 0.046889662742614746, "rewards/bleu_reward_func/mean": 0.09383320808410645, "rewards/bleu_reward_func/std": 0.10615876317024231, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 42.71999740600586, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.4648, "grad_norm": 9.511686325073242, "kl": 0.385894775390625, "learning_rate": 1e-06, "loss": 0.0501, "num_tokens": 7679535.0, "reward": 0.08031031489372253, "reward_std": 0.036660827696323395, "rewards/bleu_reward_func/mean": 0.08031031489372253, "rewards/bleu_reward_func/std": 0.07939815521240234, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 294.0, "completions/mean_terminated_length": 271.4482727050781, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.4656, "grad_norm": 7.2719950675964355, "kl": 0.149932861328125, "learning_rate": 1e-06, "loss": 0.0885, "num_tokens": 7690815.0, "reward": 0.11769823729991913, "reward_std": 0.02824997529387474, "rewards/bleu_reward_func/mean": 0.11769823729991913, "rewards/bleu_reward_func/std": 0.12788043916225433, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 120.15625, "completions/mean_terminated_length": 120.15625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4664, "grad_norm": 11.640869140625, "kl": 0.19036865234375, "learning_rate": 1e-06, "loss": 0.1164, "num_tokens": 7698172.0, "reward": 0.052216824144124985, "reward_std": 0.015741443261504173, "rewards/bleu_reward_func/mean": 0.052216824144124985, "rewards/bleu_reward_func/std": 0.01899011991918087, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 236.46875, "completions/mean_terminated_length": 144.625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4672, "grad_norm": 7.708470821380615, "kl": 0.25323486328125, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 7711683.0, "reward": 0.20987500250339508, "reward_std": 0.050422437489032745, "rewards/bleu_reward_func/mean": 0.20987500250339508, "rewards/bleu_reward_func/std": 0.21432380378246307, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 255.09375, "completions/mean_terminated_length": 120.52381134033203, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.468, "grad_norm": 8.178823471069336, "kl": 0.181243896484375, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 7721638.0, "reward": 0.11023026704788208, "reward_std": 0.03732236102223396, "rewards/bleu_reward_func/mean": 0.11023026704788208, "rewards/bleu_reward_func/std": 0.06018221378326416, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 209.40625, "completions/mean_terminated_length": 124.68000030517578, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4688, "grad_norm": 6.163815498352051, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": -0.382, "num_tokens": 7731483.0, "reward": 0.022579330950975418, "reward_std": 0.024172717705368996, "rewards/bleu_reward_func/mean": 0.022579330950975418, "rewards/bleu_reward_func/std": 0.03154170513153076, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 179.28125, "completions/mean_terminated_length": 131.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.4696, "grad_norm": 24.94843864440918, "kl": 0.303466796875, "learning_rate": 1e-06, "loss": -0.149, "num_tokens": 7740828.0, "reward": 0.062010329216718674, "reward_std": 0.030193448066711426, "rewards/bleu_reward_func/mean": 0.062010329216718674, "rewards/bleu_reward_func/std": 0.04090145602822304, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 372.0625, "completions/mean_terminated_length": 276.3157958984375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4704, "grad_norm": 2.5675299167633057, "kl": 0.03485107421875, "learning_rate": 1e-06, "loss": 0.0333, "num_tokens": 7757862.0, "reward": 0.037547022104263306, "reward_std": 0.01179808471351862, "rewards/bleu_reward_func/mean": 0.037547022104263306, "rewards/bleu_reward_func/std": 0.03366583213210106, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 223.8518524169922, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.4712, "grad_norm": 9.238602638244629, "kl": 0.09326171875, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 7769098.0, "reward": 0.0967094898223877, "reward_std": 0.041084855794906616, "rewards/bleu_reward_func/mean": 0.0967094898223877, "rewards/bleu_reward_func/std": 0.10235904902219772, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 271.71875, "completions/mean_terminated_length": 107.31578826904297, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.472, "grad_norm": 12.115531921386719, "kl": 0.196563720703125, "learning_rate": 1e-06, "loss": 0.3242, "num_tokens": 7782097.0, "reward": 0.19325336813926697, "reward_std": 0.0921676903963089, "rewards/bleu_reward_func/mean": 0.19325336813926697, "rewards/bleu_reward_func/std": 0.24508582055568695, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 197.59375, "completions/mean_terminated_length": 176.6333465576172, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.4728, "grad_norm": 9.336063385009766, "kl": 0.17633056640625, "learning_rate": 1e-06, "loss": 0.0424, "num_tokens": 7793588.0, "reward": 0.14900264143943787, "reward_std": 0.06498396396636963, "rewards/bleu_reward_func/mean": 0.14900264143943787, "rewards/bleu_reward_func/std": 0.13959822058677673, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 213.96875, "completions/mean_terminated_length": 145.1923065185547, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.4736, "grad_norm": 40.333492279052734, "kl": 0.1762237548828125, "learning_rate": 1e-06, "loss": 0.034, "num_tokens": 7809987.0, "reward": 0.11991982161998749, "reward_std": 0.024838652461767197, "rewards/bleu_reward_func/mean": 0.11991982161998749, "rewards/bleu_reward_func/std": 0.13350419700145721, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 349.46875, "completions/mean_terminated_length": 295.29168701171875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.4744, "grad_norm": 6.914525032043457, "kl": 0.02703857421875, "learning_rate": 1e-06, "loss": 0.1437, "num_tokens": 7826610.0, "reward": 0.02816709131002426, "reward_std": 0.015584287233650684, "rewards/bleu_reward_func/mean": 0.02816709131002426, "rewards/bleu_reward_func/std": 0.027631772682070732, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 189.28125, "completions/mean_terminated_length": 178.87095642089844, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4752, "grad_norm": 26.64930534362793, "kl": 0.619964599609375, "learning_rate": 1e-06, "loss": 0.0753, "num_tokens": 7834363.0, "reward": 0.1504502296447754, "reward_std": 0.061798207461833954, "rewards/bleu_reward_func/mean": 0.1504502296447754, "rewards/bleu_reward_func/std": 0.1269664466381073, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 317.125, "completions/mean_terminated_length": 200.1999969482422, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.476, "grad_norm": 8.9805908203125, "kl": 0.294097900390625, "learning_rate": 1e-06, "loss": -0.0704, "num_tokens": 7849831.0, "reward": 0.13049980998039246, "reward_std": 0.02749776840209961, "rewards/bleu_reward_func/mean": 0.13049980998039246, "rewards/bleu_reward_func/std": 0.109443299472332, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 192.1666717529297, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.4768, "grad_norm": 4.8097429275512695, "kl": 0.08526611328125, "learning_rate": 1e-06, "loss": -0.1088, "num_tokens": 7861347.0, "reward": 0.0661308616399765, "reward_std": 0.02051004208624363, "rewards/bleu_reward_func/mean": 0.0661308616399765, "rewards/bleu_reward_func/std": 0.05708196386694908, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 120.47999572753906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4776, "grad_norm": 7.351487636566162, "kl": 0.334259033203125, "learning_rate": 1e-06, "loss": 0.0519, "num_tokens": 7871295.0, "reward": 0.16019710898399353, "reward_std": 0.03656643629074097, "rewards/bleu_reward_func/mean": 0.16019710898399353, "rewards/bleu_reward_func/std": 0.19289268553256989, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 349.3125, "completions/mean_terminated_length": 186.625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4784, "grad_norm": 4.930379867553711, "kl": 0.181121826171875, "learning_rate": 1e-06, "loss": -0.0202, "num_tokens": 7887577.0, "reward": 0.1285662055015564, "reward_std": 0.03015293926000595, "rewards/bleu_reward_func/mean": 0.1285662055015564, "rewards/bleu_reward_func/std": 0.08600351959466934, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 153.4666748046875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.4792, "grad_norm": 15.110285758972168, "kl": 0.34112548828125, "learning_rate": 1e-06, "loss": -0.1173, "num_tokens": 7898453.0, "reward": 0.09940430521965027, "reward_std": 0.046547506004571915, "rewards/bleu_reward_func/mean": 0.09940430521965027, "rewards/bleu_reward_func/std": 0.05020095780491829, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 334.03125, "completions/mean_terminated_length": 315.6206970214844, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.48, "grad_norm": 3.0496630668640137, "kl": 0.03216552734375, "learning_rate": 1e-06, "loss": -0.1382, "num_tokens": 7911190.0, "reward": 0.08442967385053635, "reward_std": 0.027117565274238586, "rewards/bleu_reward_func/mean": 0.08442967385053635, "rewards/bleu_reward_func/std": 0.07272256910800934, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 132.125, "completions/mean_terminated_length": 119.87096405029297, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4808, "grad_norm": 9.77161693572998, "kl": 0.3096923828125, "learning_rate": 1e-06, "loss": 0.0712, "num_tokens": 7923114.0, "reward": 0.19400227069854736, "reward_std": 0.08562377095222473, "rewards/bleu_reward_func/mean": 0.19400227069854736, "rewards/bleu_reward_func/std": 0.18403199315071106, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 105.31578826904297, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.4816, "grad_norm": 5.418551445007324, "kl": 0.132049560546875, "learning_rate": 1e-06, "loss": 0.057, "num_tokens": 7937747.0, "reward": 0.15049128234386444, "reward_std": 0.024429049342870712, "rewards/bleu_reward_func/mean": 0.15049128234386444, "rewards/bleu_reward_func/std": 0.1750853955745697, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 295.46875, "completions/mean_terminated_length": 104.4117660522461, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.4824, "grad_norm": 4.260025501251221, "kl": 0.074981689453125, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 7951266.0, "reward": 0.0902150496840477, "reward_std": 0.0313844196498394, "rewards/bleu_reward_func/mean": 0.0902150496840477, "rewards/bleu_reward_func/std": 0.09559616446495056, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 255.36842346191406, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4832, "grad_norm": 3.981938362121582, "kl": 0.05792236328125, "learning_rate": 1e-06, "loss": 0.068, "num_tokens": 7967510.0, "reward": 0.15250109136104584, "reward_std": 0.050509147346019745, "rewards/bleu_reward_func/mean": 0.15250109136104584, "rewards/bleu_reward_func/std": 0.2119276374578476, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 205.2857208251953, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.484, "grad_norm": 4.506271839141846, "kl": 0.133819580078125, "learning_rate": 1e-06, "loss": 0.0768, "num_tokens": 7979194.0, "reward": 0.049993276596069336, "reward_std": 0.01375819742679596, "rewards/bleu_reward_func/mean": 0.049993276596069336, "rewards/bleu_reward_func/std": 0.019665135070681572, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 184.90625, "completions/mean_terminated_length": 124.33333587646484, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4848, "grad_norm": 13.16498851776123, "kl": 0.163360595703125, "learning_rate": 1e-06, "loss": 0.2145, "num_tokens": 7992215.0, "reward": 0.16849525272846222, "reward_std": 0.041973263025283813, "rewards/bleu_reward_func/mean": 0.16849525272846222, "rewards/bleu_reward_func/std": 0.11670318245887756, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 278.3125, "completions/mean_terminated_length": 278.3125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.4856, "grad_norm": 9.305381774902344, "kl": 0.1671142578125, "learning_rate": 1e-06, "loss": 0.0542, "num_tokens": 8003561.0, "reward": 0.14806249737739563, "reward_std": 0.04475884884595871, "rewards/bleu_reward_func/mean": 0.14806249737739563, "rewards/bleu_reward_func/std": 0.10317616909742355, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 392.3125, "completions/mean_terminated_length": 320.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4864, "grad_norm": 8.050078392028809, "kl": 0.0286865234375, "learning_rate": 1e-06, "loss": -0.0652, "num_tokens": 8020771.0, "reward": 0.09127211570739746, "reward_std": 0.02751500904560089, "rewards/bleu_reward_func/mean": 0.09127211570739746, "rewards/bleu_reward_func/std": 0.04517889395356178, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 285.93548583984375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.4872, "grad_norm": 9.551247596740723, "kl": 0.05633544921875, "learning_rate": 1e-06, "loss": 0.0629, "num_tokens": 8033227.0, "reward": 0.07062816619873047, "reward_std": 0.032938919961452484, "rewards/bleu_reward_func/mean": 0.07062816619873047, "rewards/bleu_reward_func/std": 0.05320809781551361, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 184.92308044433594, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.488, "grad_norm": 28.349098205566406, "kl": 0.06927490234375, "learning_rate": 1e-06, "loss": 0.1921, "num_tokens": 8048579.0, "reward": 0.12640823423862457, "reward_std": 0.028129609301686287, "rewards/bleu_reward_func/mean": 0.12640823423862457, "rewards/bleu_reward_func/std": 0.12890547513961792, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 334.28125, "completions/mean_terminated_length": 241.1904754638672, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.4888, "grad_norm": 6.840433597564697, "kl": 0.054046630859375, "learning_rate": 1e-06, "loss": 0.1175, "num_tokens": 8063660.0, "reward": 0.06585729867219925, "reward_std": 0.016829343512654305, "rewards/bleu_reward_func/mean": 0.06585729867219925, "rewards/bleu_reward_func/std": 0.027104271575808525, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 469.4375, "completions/mean_terminated_length": 443.8999938964844, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.4896, "grad_norm": 2.304220199584961, "kl": 0.029449462890625, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 8081922.0, "reward": 0.029903851449489594, "reward_std": 0.007851570844650269, "rewards/bleu_reward_func/mean": 0.029903851449489594, "rewards/bleu_reward_func/std": 0.017835307866334915, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 187.61289978027344, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4904, "grad_norm": 5.66249418258667, "kl": 0.0855712890625, "learning_rate": 1e-06, "loss": 0.1286, "num_tokens": 8091010.0, "reward": 0.2724965810775757, "reward_std": 0.06183997541666031, "rewards/bleu_reward_func/mean": 0.2724965810775757, "rewards/bleu_reward_func/std": 0.2708708643913269, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 328.71875, "completions/mean_terminated_length": 186.1666717529297, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.4912, "grad_norm": 3.228691816329956, "kl": 0.0655670166015625, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 8105809.0, "reward": 0.12389599531888962, "reward_std": 0.07396578788757324, "rewards/bleu_reward_func/mean": 0.12389599531888962, "rewards/bleu_reward_func/std": 0.18483103811740875, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 317.59375, "completions/mean_terminated_length": 215.76190185546875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.492, "grad_norm": 3.5793278217315674, "kl": 0.044097900390625, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 8118620.0, "reward": 0.08205534517765045, "reward_std": 0.032849013805389404, "rewards/bleu_reward_func/mean": 0.08205534517765045, "rewards/bleu_reward_func/std": 0.05394502356648445, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 180.9375, "completions/mean_terminated_length": 180.9375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.4928, "grad_norm": 7.083518028259277, "kl": 0.40167236328125, "learning_rate": 1e-06, "loss": -0.1081, "num_tokens": 8128922.0, "reward": 0.12701216340065002, "reward_std": 0.03847620263695717, "rewards/bleu_reward_func/mean": 0.12701216340065002, "rewards/bleu_reward_func/std": 0.08405326306819916, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 145.53846740722656, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.4936, "grad_norm": 6.532368183135986, "kl": 0.239410400390625, "learning_rate": 1e-06, "loss": -0.0347, "num_tokens": 8143162.0, "reward": 0.11757355183362961, "reward_std": 0.02820819616317749, "rewards/bleu_reward_func/mean": 0.11757355183362961, "rewards/bleu_reward_func/std": 0.10728771984577179, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 364.375, "completions/mean_terminated_length": 297.2727355957031, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.4944, "grad_norm": 2.549912929534912, "kl": 0.044189453125, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 8158198.0, "reward": 0.04174516722559929, "reward_std": 0.011650302447378635, "rewards/bleu_reward_func/mean": 0.04174516722559929, "rewards/bleu_reward_func/std": 0.03221089020371437, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 311.84375, "completions/mean_terminated_length": 207.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.4952, "grad_norm": 3.869034767150879, "kl": 0.05462646484375, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 8171945.0, "reward": 0.028928130865097046, "reward_std": 0.012434298172593117, "rewards/bleu_reward_func/mean": 0.028928130865097046, "rewards/bleu_reward_func/std": 0.025789210572838783, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 243.90625, "completions/mean_terminated_length": 205.60714721679688, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.496, "grad_norm": 6.46402645111084, "kl": 0.304168701171875, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 8181710.0, "reward": 0.13130733370780945, "reward_std": 0.018212325870990753, "rewards/bleu_reward_func/mean": 0.13130733370780945, "rewards/bleu_reward_func/std": 0.09850703179836273, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 203.42857360839844, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.4968, "grad_norm": 5.473258018493652, "kl": 0.16949462890625, "learning_rate": 1e-06, "loss": -0.0354, "num_tokens": 8193118.0, "reward": 0.2502046227455139, "reward_std": 0.03522457554936409, "rewards/bleu_reward_func/mean": 0.2502046227455139, "rewards/bleu_reward_func/std": 0.2565787732601166, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 236.71875, "completions/mean_terminated_length": 144.95834350585938, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.4976, "grad_norm": 30.692569732666016, "kl": 0.29986572265625, "learning_rate": 1e-06, "loss": -0.0477, "num_tokens": 8207549.0, "reward": 0.22225125133991241, "reward_std": 0.033524345606565475, "rewards/bleu_reward_func/mean": 0.22225125133991241, "rewards/bleu_reward_func/std": 0.19432197511196136, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 120.96875, "completions/mean_terminated_length": 80.51724243164062, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.4984, "grad_norm": 7.908195495605469, "kl": 0.380523681640625, "learning_rate": 1e-06, "loss": -0.0756, "num_tokens": 8218324.0, "reward": 0.23351526260375977, "reward_std": 0.05452558770775795, "rewards/bleu_reward_func/mean": 0.23351526260375977, "rewards/bleu_reward_func/std": 0.1365489512681961, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 172.5625, "completions/mean_terminated_length": 137.44827270507812, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.4992, "grad_norm": 8.18444538116455, "kl": 0.296051025390625, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 8228742.0, "reward": 0.14677512645721436, "reward_std": 0.04820986092090607, "rewards/bleu_reward_func/mean": 0.14677512645721436, "rewards/bleu_reward_func/std": 0.15982075035572052, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 333.65625, "completions/mean_terminated_length": 263.86956787109375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.5, "grad_norm": 15.107973098754883, "kl": 0.160400390625, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 8245059.0, "reward": 0.13298457860946655, "reward_std": 0.018914809450507164, "rewards/bleu_reward_func/mean": 0.13298457860946655, "rewards/bleu_reward_func/std": 0.07686522603034973, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 299.53125, "completions/mean_terminated_length": 260.1851806640625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5008, "grad_norm": 131.54771423339844, "kl": 0.102081298828125, "learning_rate": 1e-06, "loss": 0.1782, "num_tokens": 8256620.0, "reward": 0.1039574146270752, "reward_std": 0.03130800276994705, "rewards/bleu_reward_func/mean": 0.1039574146270752, "rewards/bleu_reward_func/std": 0.05177094042301178, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 202.15625, "completions/mean_terminated_length": 181.50001525878906, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.5016, "grad_norm": 8.109158515930176, "kl": 0.213104248046875, "learning_rate": 1e-06, "loss": -0.0456, "num_tokens": 8266585.0, "reward": 0.23561137914657593, "reward_std": 0.03910698741674423, "rewards/bleu_reward_func/mean": 0.23561137914657593, "rewards/bleu_reward_func/std": 0.1352909654378891, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 259.5625, "completions/mean_terminated_length": 144.8181915283203, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5024, "grad_norm": 5.162299633026123, "kl": 0.147857666015625, "learning_rate": 1e-06, "loss": -0.0329, "num_tokens": 8278827.0, "reward": 0.1635468751192093, "reward_std": 0.04077983647584915, "rewards/bleu_reward_func/mean": 0.1635468751192093, "rewards/bleu_reward_func/std": 0.1520238220691681, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 354.84375, "completions/mean_terminated_length": 302.4583435058594, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.5032, "grad_norm": 3.0983669757843018, "kl": 0.0435791015625, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 8293502.0, "reward": 0.05737052857875824, "reward_std": 0.021961018443107605, "rewards/bleu_reward_func/mean": 0.05737052857875824, "rewards/bleu_reward_func/std": 0.03505769371986389, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 185.65516662597656, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.504, "grad_norm": 6.821644306182861, "kl": 0.0950927734375, "learning_rate": 1e-06, "loss": -0.0482, "num_tokens": 8302182.0, "reward": 0.07243853062391281, "reward_std": 0.06683069467544556, "rewards/bleu_reward_func/mean": 0.07243853062391281, "rewards/bleu_reward_func/std": 0.10312769562005997, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 352.34375, "completions/mean_terminated_length": 329.5357360839844, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5048, "grad_norm": 3.5717883110046387, "kl": 0.036041259765625, "learning_rate": 1e-06, "loss": 0.1298, "num_tokens": 8318753.0, "reward": 0.026654381304979324, "reward_std": 0.024883870035409927, "rewards/bleu_reward_func/mean": 0.026654381304979324, "rewards/bleu_reward_func/std": 0.03104417398571968, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 183.09375, "completions/mean_terminated_length": 183.09375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.5056, "grad_norm": 6.945363998413086, "kl": 0.234039306640625, "learning_rate": 1e-06, "loss": 0.0696, "num_tokens": 8329596.0, "reward": 0.18802031874656677, "reward_std": 0.06351514160633087, "rewards/bleu_reward_func/mean": 0.18802031874656677, "rewards/bleu_reward_func/std": 0.16961929202079773, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 134.33334350585938, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.5064, "grad_norm": 99.53560638427734, "kl": 0.190399169921875, "learning_rate": 1e-06, "loss": 0.0621, "num_tokens": 8343090.0, "reward": 0.06075248867273331, "reward_std": 0.018952492624521255, "rewards/bleu_reward_func/mean": 0.06075248867273331, "rewards/bleu_reward_func/std": 0.057455144822597504, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 291.71875, "completions/mean_terminated_length": 268.9310302734375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5072, "grad_norm": 6.928218364715576, "kl": 0.155853271484375, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 8354305.0, "reward": 0.15043729543685913, "reward_std": 0.04871266707777977, "rewards/bleu_reward_func/mean": 0.15043729543685913, "rewards/bleu_reward_func/std": 0.17611344158649445, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 198.5625, "completions/mean_terminated_length": 177.6666717529297, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.508, "grad_norm": 4.697340488433838, "kl": 0.094085693359375, "learning_rate": 1e-06, "loss": 0.0737, "num_tokens": 8364091.0, "reward": 0.06601180136203766, "reward_std": 0.02227596938610077, "rewards/bleu_reward_func/mean": 0.06601180136203766, "rewards/bleu_reward_func/std": 0.043257758021354675, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 410.78125, "completions/mean_terminated_length": 332.0555725097656, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5088, "grad_norm": 77.46025085449219, "kl": 0.05377197265625, "learning_rate": 1e-06, "loss": -0.1447, "num_tokens": 8381724.0, "reward": 0.03671726584434509, "reward_std": 0.015178699977695942, "rewards/bleu_reward_func/mean": 0.03671726584434509, "rewards/bleu_reward_func/std": 0.03225603699684143, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 154.5, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5096, "grad_norm": 6.136791706085205, "kl": 0.124664306640625, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 8397476.0, "reward": 0.16264644265174866, "reward_std": 0.03679278865456581, "rewards/bleu_reward_func/mean": 0.16264644265174866, "rewards/bleu_reward_func/std": 0.16195148229599, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 179.57144165039062, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.5104, "grad_norm": 4.400974750518799, "kl": 0.058013916015625, "learning_rate": 1e-06, "loss": -0.0244, "num_tokens": 8406544.0, "reward": 0.059636689722537994, "reward_std": 0.024229735136032104, "rewards/bleu_reward_func/mean": 0.059636689722537994, "rewards/bleu_reward_func/std": 0.04718983918428421, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 287.09375, "completions/mean_terminated_length": 235.19232177734375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.5112, "grad_norm": 10.916999816894531, "kl": 0.118011474609375, "learning_rate": 1e-06, "loss": 0.0766, "num_tokens": 8422323.0, "reward": 0.10468322038650513, "reward_std": 0.021623361855745316, "rewards/bleu_reward_func/mean": 0.10468322038650513, "rewards/bleu_reward_func/std": 0.08197237551212311, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 336.28125, "completions/mean_terminated_length": 311.1785888671875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.512, "grad_norm": 2.1528663635253906, "kl": 0.033172607421875, "learning_rate": 1e-06, "loss": -0.0722, "num_tokens": 8437100.0, "reward": 0.10049737989902496, "reward_std": 0.03208357095718384, "rewards/bleu_reward_func/mean": 0.10049737989902496, "rewards/bleu_reward_func/std": 0.0739847868680954, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 361.375, "completions/mean_terminated_length": 190.6666717529297, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5128, "grad_norm": 4.406239986419678, "kl": 0.163421630859375, "learning_rate": 1e-06, "loss": 0.0712, "num_tokens": 8450184.0, "reward": 0.06270510703325272, "reward_std": 0.01890096440911293, "rewards/bleu_reward_func/mean": 0.06270510703325272, "rewards/bleu_reward_func/std": 0.04299367591738701, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 344.75, "completions/mean_terminated_length": 268.727294921875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.5136, "grad_norm": 3.8396599292755127, "kl": 0.21343994140625, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 8464952.0, "reward": 0.12920798361301422, "reward_std": 0.025508491322398186, "rewards/bleu_reward_func/mean": 0.12920798361301422, "rewards/bleu_reward_func/std": 0.11343086510896683, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 243.0625, "completions/mean_terminated_length": 204.6428680419922, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5144, "grad_norm": 9.709904670715332, "kl": 0.127349853515625, "learning_rate": 1e-06, "loss": -0.0569, "num_tokens": 8481010.0, "reward": 0.08636625856161118, "reward_std": 0.023468628525733948, "rewards/bleu_reward_func/mean": 0.08636625856161118, "rewards/bleu_reward_func/std": 0.11172276735305786, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 319.8125, "completions/mean_terminated_length": 150.23529052734375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.5152, "grad_norm": 4.022066116333008, "kl": 0.095428466796875, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 8495668.0, "reward": 0.13581258058547974, "reward_std": 0.049042053520679474, "rewards/bleu_reward_func/mean": 0.13581258058547974, "rewards/bleu_reward_func/std": 0.10865607112646103, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 200.40625, "completions/mean_terminated_length": 113.15999603271484, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.516, "grad_norm": 14.371395111083984, "kl": 0.188232421875, "learning_rate": 1e-06, "loss": 0.1384, "num_tokens": 8510193.0, "reward": 0.16988492012023926, "reward_std": 0.02835988998413086, "rewards/bleu_reward_func/mean": 0.16988492012023926, "rewards/bleu_reward_func/std": 0.22432467341423035, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 350.625, "completions/mean_terminated_length": 189.25, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.5168, "grad_norm": 3.603522539138794, "kl": 0.106658935546875, "learning_rate": 1e-06, "loss": 0.0683, "num_tokens": 8527613.0, "reward": 0.04950277507305145, "reward_std": 0.02557562291622162, "rewards/bleu_reward_func/mean": 0.04950277507305145, "rewards/bleu_reward_func/std": 0.036064986139535904, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 240.71875, "completions/mean_terminated_length": 190.48147583007812, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.5176, "grad_norm": 2.1311533451080322, "kl": 0.040771484375, "learning_rate": 1e-06, "loss": -0.0675, "num_tokens": 8540012.0, "reward": 0.4242175817489624, "reward_std": 0.05443207919597626, "rewards/bleu_reward_func/mean": 0.4242175817489624, "rewards/bleu_reward_func/std": 0.3835957646369934, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 162.78125, "completions/mean_terminated_length": 82.19231414794922, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.5184, "grad_norm": 7.217045783996582, "kl": 0.238250732421875, "learning_rate": 1e-06, "loss": -0.2492, "num_tokens": 8550285.0, "reward": 0.15483121573925018, "reward_std": 0.04074571654200554, "rewards/bleu_reward_func/mean": 0.15483121573925018, "rewards/bleu_reward_func/std": 0.1628112941980362, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 226.03125, "completions/mean_terminated_length": 196.44827270507812, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5192, "grad_norm": 9.426239013671875, "kl": 0.326446533203125, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 8562006.0, "reward": 0.22631683945655823, "reward_std": 0.046764910221099854, "rewards/bleu_reward_func/mean": 0.22631683945655823, "rewards/bleu_reward_func/std": 0.24870522320270538, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 408.4375, "completions/mean_terminated_length": 373.91668701171875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.52, "grad_norm": 2.5924437046051025, "kl": 0.03192138671875, "learning_rate": 1e-06, "loss": -0.0517, "num_tokens": 8577388.0, "reward": 0.050332337617874146, "reward_std": 0.013445645570755005, "rewards/bleu_reward_func/mean": 0.050332337617874146, "rewards/bleu_reward_func/std": 0.04263650253415108, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 323.65625, "completions/mean_terminated_length": 296.75, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.5208, "grad_norm": 6.97017765045166, "kl": 0.062255859375, "learning_rate": 1e-06, "loss": 0.075, "num_tokens": 8589889.0, "reward": 0.0671861320734024, "reward_std": 0.020000552758574486, "rewards/bleu_reward_func/mean": 0.0671861320734024, "rewards/bleu_reward_func/std": 0.027637863531708717, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 202.6875, "completions/mean_terminated_length": 170.6896514892578, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.5216, "grad_norm": 5.9939727783203125, "kl": 0.30169677734375, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 8603471.0, "reward": 0.23086336255073547, "reward_std": 0.03887036070227623, "rewards/bleu_reward_func/mean": 0.23086336255073547, "rewards/bleu_reward_func/std": 0.1954699456691742, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 334.09375, "completions/mean_terminated_length": 264.478271484375, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.5224, "grad_norm": 10.721747398376465, "kl": 0.11480712890625, "learning_rate": 1e-06, "loss": -0.0181, "num_tokens": 8618802.0, "reward": 0.12768197059631348, "reward_std": 0.018044453114271164, "rewards/bleu_reward_func/mean": 0.12768197059631348, "rewards/bleu_reward_func/std": 0.18208470940589905, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 254.28125, "completions/mean_terminated_length": 182.1199951171875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.5232, "grad_norm": 21.832870483398438, "kl": 0.224090576171875, "learning_rate": 1e-06, "loss": 0.0275, "num_tokens": 8633395.0, "reward": 0.23750805854797363, "reward_std": 0.10584703087806702, "rewards/bleu_reward_func/mean": 0.23750805854797363, "rewards/bleu_reward_func/std": 0.24472850561141968, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 207.8260955810547, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.524, "grad_norm": 14.025049209594727, "kl": 0.1319580078125, "learning_rate": 1e-06, "loss": 0.2269, "num_tokens": 8647583.0, "reward": 0.08922699838876724, "reward_std": 0.022407300770282745, "rewards/bleu_reward_func/mean": 0.08922699838876724, "rewards/bleu_reward_func/std": 0.05691966786980629, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 417.34375, "completions/mean_terminated_length": 374.3182067871094, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5248, "grad_norm": 2.506775379180908, "kl": 0.030517578125, "learning_rate": 1e-06, "loss": -0.1182, "num_tokens": 8664458.0, "reward": 0.056899260729551315, "reward_std": 0.024433575570583344, "rewards/bleu_reward_func/mean": 0.056899260729551315, "rewards/bleu_reward_func/std": 0.043169718235731125, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 251.78125, "completions/mean_terminated_length": 214.60714721679688, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.5256, "grad_norm": 7.267916202545166, "kl": 0.123870849609375, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 8673803.0, "reward": 0.10382385551929474, "reward_std": 0.051886267960071564, "rewards/bleu_reward_func/mean": 0.10382385551929474, "rewards/bleu_reward_func/std": 0.06761174649000168, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 227.34375, "completions/mean_terminated_length": 132.45834350585938, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5264, "grad_norm": 5.546152114868164, "kl": 0.098236083984375, "learning_rate": 1e-06, "loss": -0.0623, "num_tokens": 8685270.0, "reward": 0.1565043181180954, "reward_std": 0.08428065478801727, "rewards/bleu_reward_func/mean": 0.1565043181180954, "rewards/bleu_reward_func/std": 0.17227834463119507, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 152.96875, "completions/mean_terminated_length": 86.48148345947266, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.5272, "grad_norm": 14.263039588928223, "kl": 0.30914306640625, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 8696005.0, "reward": 0.24965888261795044, "reward_std": 0.051375266164541245, "rewards/bleu_reward_func/mean": 0.24965888261795044, "rewards/bleu_reward_func/std": 0.21870571374893188, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 32.5625, "completions/mean_terminated_length": 32.5625, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.528, "grad_norm": 442.6045837402344, "kl": 0.5015869140625, "learning_rate": 1e-06, "loss": 0.0936, "num_tokens": 8704767.0, "reward": 0.13235034048557281, "reward_std": 0.07672514766454697, "rewards/bleu_reward_func/mean": 0.13235034048557281, "rewards/bleu_reward_func/std": 0.13803941011428833, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 252.4375, "completions/mean_terminated_length": 74.84210205078125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.5288, "grad_norm": 5.065097332000732, "kl": 0.18255615234375, "learning_rate": 1e-06, "loss": 0.0464, "num_tokens": 8721005.0, "reward": 0.2883501648902893, "reward_std": 0.022871889173984528, "rewards/bleu_reward_func/mean": 0.2883501648902893, "rewards/bleu_reward_func/std": 0.23920658230781555, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 144.13792419433594, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5296, "grad_norm": 4.1915202140808105, "kl": 0.152069091796875, "learning_rate": 1e-06, "loss": 0.1483, "num_tokens": 8729545.0, "reward": 0.09845434874296188, "reward_std": 0.049190133810043335, "rewards/bleu_reward_func/mean": 0.09845434874296188, "rewards/bleu_reward_func/std": 0.06372099369764328, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 293.9375, "completions/mean_terminated_length": 279.4000244140625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.5304, "grad_norm": 10.675251007080078, "kl": 0.0498046875, "learning_rate": 1e-06, "loss": 0.1798, "num_tokens": 8741895.0, "reward": 0.09067553281784058, "reward_std": 0.036186374723911285, "rewards/bleu_reward_func/mean": 0.09067553281784058, "rewards/bleu_reward_func/std": 0.057389046996831894, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 157.75, "completions/mean_terminated_length": 146.32257080078125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5312, "grad_norm": 42.93239212036133, "kl": 0.20751953125, "learning_rate": 1e-06, "loss": 0.0697, "num_tokens": 8752255.0, "reward": 0.27352431416511536, "reward_std": 0.07531043887138367, "rewards/bleu_reward_func/mean": 0.27352431416511536, "rewards/bleu_reward_func/std": 0.13157765567302704, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 222.239990234375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.532, "grad_norm": 5.36870813369751, "kl": 0.075042724609375, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 8764259.0, "reward": 0.024750784039497375, "reward_std": 0.022341227158904076, "rewards/bleu_reward_func/mean": 0.024750784039497375, "rewards/bleu_reward_func/std": 0.03164950758218765, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 231.0, "completions/mean_terminated_length": 231.0, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5328, "grad_norm": 5.825709342956543, "kl": 0.2059326171875, "learning_rate": 1e-06, "loss": -0.126, "num_tokens": 8776043.0, "reward": 0.09888751804828644, "reward_std": 0.027325943112373352, "rewards/bleu_reward_func/mean": 0.09888751804828644, "rewards/bleu_reward_func/std": 0.06260307133197784, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 154.09375, "completions/mean_terminated_length": 71.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.5336, "grad_norm": 13.691543579101562, "kl": 0.3173828125, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 8787118.0, "reward": 0.16891013085842133, "reward_std": 0.033737391233444214, "rewards/bleu_reward_func/mean": 0.16891013085842133, "rewards/bleu_reward_func/std": 0.1783466339111328, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 258.71875, "completions/mean_terminated_length": 174.2916717529297, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5344, "grad_norm": 5.329594612121582, "kl": 0.25360107421875, "learning_rate": 1e-06, "loss": -0.0877, "num_tokens": 8801157.0, "reward": 0.071600541472435, "reward_std": 0.021211300045251846, "rewards/bleu_reward_func/mean": 0.071600541472435, "rewards/bleu_reward_func/std": 0.054277434945106506, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 300.71875, "completions/mean_terminated_length": 136.38888549804688, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.5352, "grad_norm": 10.509814262390137, "kl": 0.1150054931640625, "learning_rate": 1e-06, "loss": 0.0738, "num_tokens": 8815244.0, "reward": 0.09365338832139969, "reward_std": 0.023113342002034187, "rewards/bleu_reward_func/mean": 0.09365338832139969, "rewards/bleu_reward_func/std": 0.0734696164727211, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 253.9375, "completions/mean_terminated_length": 245.61289978027344, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.536, "grad_norm": 23.27628517150879, "kl": 0.17572021484375, "learning_rate": 1e-06, "loss": -0.1243, "num_tokens": 8826930.0, "reward": 0.10506822168827057, "reward_std": 0.023658432066440582, "rewards/bleu_reward_func/mean": 0.10506822168827057, "rewards/bleu_reward_func/std": 0.06695646047592163, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 447.71875, "completions/mean_terminated_length": 374.86669921875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5368, "grad_norm": 3.2729108333587646, "kl": 0.028778076171875, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 8843889.0, "reward": 0.025088129565119743, "reward_std": 0.00651167519390583, "rewards/bleu_reward_func/mean": 0.025088129565119743, "rewards/bleu_reward_func/std": 0.02992870658636093, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 220.0, "completions/mean_terminated_length": 210.5806427001953, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.5376, "grad_norm": 245.07106018066406, "kl": 0.28985595703125, "learning_rate": 1e-06, "loss": 0.0867, "num_tokens": 8855401.0, "reward": 0.09645279496908188, "reward_std": 0.0731353610754013, "rewards/bleu_reward_func/mean": 0.09645279496908188, "rewards/bleu_reward_func/std": 0.09792789071798325, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 257.0625, "completions/mean_terminated_length": 230.6896514892578, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.5384, "grad_norm": 4.84860897064209, "kl": 0.12005615234375, "learning_rate": 1e-06, "loss": -0.1854, "num_tokens": 8867371.0, "reward": 0.11370459198951721, "reward_std": 0.06978605687618256, "rewards/bleu_reward_func/mean": 0.11370459198951721, "rewards/bleu_reward_func/std": 0.18471869826316833, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 237.78125, "completions/mean_terminated_length": 161.0, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5392, "grad_norm": 33.883243560791016, "kl": 0.1602783203125, "learning_rate": 1e-06, "loss": -0.0528, "num_tokens": 8882588.0, "reward": 0.20929288864135742, "reward_std": 0.04879160225391388, "rewards/bleu_reward_func/mean": 0.20929288864135742, "rewards/bleu_reward_func/std": 0.17186923325061798, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 240.57144165039062, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.54, "grad_norm": 6.54965877532959, "kl": 0.08441162109375, "learning_rate": 1e-06, "loss": -0.037, "num_tokens": 8894508.0, "reward": 0.04714702442288399, "reward_std": 0.010213707573711872, "rewards/bleu_reward_func/mean": 0.04714702442288399, "rewards/bleu_reward_func/std": 0.04436042159795761, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 91.36000061035156, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.5408, "grad_norm": 11.63567066192627, "kl": 0.16424560546875, "learning_rate": 1e-06, "loss": -0.0095, "num_tokens": 8902704.0, "reward": 0.18900102376937866, "reward_std": 0.045921262353658676, "rewards/bleu_reward_func/mean": 0.18900102376937866, "rewards/bleu_reward_func/std": 0.25213196873664856, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 181.96875, "completions/mean_terminated_length": 105.80769348144531, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.5416, "grad_norm": 180.27212524414062, "kl": 0.203125, "learning_rate": 1e-06, "loss": 0.5786, "num_tokens": 8917647.0, "reward": 0.11598189175128937, "reward_std": 0.0453701987862587, "rewards/bleu_reward_func/mean": 0.11598189175128937, "rewards/bleu_reward_func/std": 0.12164945900440216, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 187.59375, "completions/mean_terminated_length": 79.45833587646484, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5424, "grad_norm": 13.031902313232422, "kl": 0.232452392578125, "learning_rate": 1e-06, "loss": 0.1064, "num_tokens": 8927434.0, "reward": 0.2563853859901428, "reward_std": 0.021821634843945503, "rewards/bleu_reward_func/mean": 0.2563853859901428, "rewards/bleu_reward_func/std": 0.25126466155052185, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 157.53125, "completions/mean_terminated_length": 133.90000915527344, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5432, "grad_norm": 37.64163589477539, "kl": 0.17242431640625, "learning_rate": 1e-06, "loss": -0.1013, "num_tokens": 8939683.0, "reward": 0.22671283781528473, "reward_std": 0.05255472660064697, "rewards/bleu_reward_func/mean": 0.22671283781528473, "rewards/bleu_reward_func/std": 0.22751960158348083, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 263.71875, "completions/mean_terminated_length": 238.03448486328125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.544, "grad_norm": 129.8694610595703, "kl": 0.1982421875, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 8949442.0, "reward": 0.08898493647575378, "reward_std": 0.0456019788980484, "rewards/bleu_reward_func/mean": 0.08898493647575378, "rewards/bleu_reward_func/std": 0.10032162815332413, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 510.1875, "completions/mean_terminated_length": 483.0, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.5448, "grad_norm": 16.652488708496094, "kl": 0.063751220703125, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 8968384.0, "reward": 0.04759781062602997, "reward_std": 0.009598957374691963, "rewards/bleu_reward_func/mean": 0.04759781062602997, "rewards/bleu_reward_func/std": 0.050501517951488495, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 253.88235473632812, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5456, "grad_norm": 130.1652069091797, "kl": 0.032989501953125, "learning_rate": 1e-06, "loss": -0.213, "num_tokens": 8984364.0, "reward": 0.07015404105186462, "reward_std": 0.037277355790138245, "rewards/bleu_reward_func/mean": 0.07015404105186462, "rewards/bleu_reward_func/std": 0.10696472972631454, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 221.59375, "completions/mean_terminated_length": 221.59375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.5464, "grad_norm": 15.168272018432617, "kl": 0.25604248046875, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 8993615.0, "reward": 0.05222689360380173, "reward_std": 0.015750503167510033, "rewards/bleu_reward_func/mean": 0.05222689360380173, "rewards/bleu_reward_func/std": 0.03590291365981102, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 231.0625, "completions/mean_terminated_length": 121.13043975830078, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5472, "grad_norm": 35.623695373535156, "kl": 0.1175537109375, "learning_rate": 1e-06, "loss": -0.0464, "num_tokens": 9004377.0, "reward": 0.04063406586647034, "reward_std": 0.028225397691130638, "rewards/bleu_reward_func/mean": 0.04063406586647034, "rewards/bleu_reward_func/std": 0.05525263398885727, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 361.78125, "completions/mean_terminated_length": 259.0, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.548, "grad_norm": 593.3710327148438, "kl": 0.0638427734375, "learning_rate": 1e-06, "loss": 0.1608, "num_tokens": 9020330.0, "reward": 0.1717539131641388, "reward_std": 0.0751393511891365, "rewards/bleu_reward_func/mean": 0.1717539131641388, "rewards/bleu_reward_func/std": 0.25347769260406494, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 175.59375, "completions/mean_terminated_length": 175.59375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.5488, "grad_norm": 7.423861980438232, "kl": 0.170654296875, "learning_rate": 1e-06, "loss": -0.0565, "num_tokens": 9028509.0, "reward": 0.091148242354393, "reward_std": 0.017926650121808052, "rewards/bleu_reward_func/mean": 0.091148242354393, "rewards/bleu_reward_func/std": 0.07815965265035629, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 455.15625, "completions/mean_terminated_length": 398.3125, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.5496, "grad_norm": 2.612523317337036, "kl": 0.033416748046875, "learning_rate": 1e-06, "loss": -0.0994, "num_tokens": 9046090.0, "reward": 0.04202251136302948, "reward_std": 0.015885071828961372, "rewards/bleu_reward_func/mean": 0.04202251136302948, "rewards/bleu_reward_func/std": 0.03677666559815407, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 448.9375, "completions/mean_terminated_length": 399.8888854980469, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.5504, "grad_norm": 14.944954872131348, "kl": 0.056488037109375, "learning_rate": 1e-06, "loss": 0.1249, "num_tokens": 9064360.0, "reward": 0.03943703696131706, "reward_std": 0.024654783308506012, "rewards/bleu_reward_func/mean": 0.03943703696131706, "rewards/bleu_reward_func/std": 0.03771531209349632, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 85.30769348144531, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.5512, "grad_norm": 66.34259033203125, "kl": 0.408477783203125, "learning_rate": 1e-06, "loss": -0.1375, "num_tokens": 9078698.0, "reward": 0.20156420767307281, "reward_std": 0.07818345725536346, "rewards/bleu_reward_func/mean": 0.20156420767307281, "rewards/bleu_reward_func/std": 0.23512743413448334, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 170.6666717529297, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.552, "grad_norm": 45.66501998901367, "kl": 1.2672119140625, "learning_rate": 1e-06, "loss": 0.3161, "num_tokens": 9087402.0, "reward": 0.12671013176441193, "reward_std": 0.03653056174516678, "rewards/bleu_reward_func/mean": 0.12671013176441193, "rewards/bleu_reward_func/std": 0.0971146747469902, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 237.125, "completions/mean_terminated_length": 93.14286041259766, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.5528, "grad_norm": 497.3811950683594, "kl": 0.163970947265625, "learning_rate": 1e-06, "loss": 0.069, "num_tokens": 9099254.0, "reward": 0.0354698970913887, "reward_std": 0.02819395810365677, "rewards/bleu_reward_func/mean": 0.0354698970913887, "rewards/bleu_reward_func/std": 0.030991537496447563, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 287.65625, "completions/mean_terminated_length": 246.11111450195312, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.5536, "grad_norm": 6.150320053100586, "kl": 0.086273193359375, "learning_rate": 1e-06, "loss": 0.1272, "num_tokens": 9110347.0, "reward": 0.06792090833187103, "reward_std": 0.02885974571108818, "rewards/bleu_reward_func/mean": 0.06792090833187103, "rewards/bleu_reward_func/std": 0.06222621724009514, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 244.84375, "completions/mean_terminated_length": 140.30435180664062, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.5544, "grad_norm": 36.0806999206543, "kl": 0.23052978515625, "learning_rate": 1e-06, "loss": 0.0802, "num_tokens": 9121382.0, "reward": 0.09483693540096283, "reward_std": 0.05147245526313782, "rewards/bleu_reward_func/mean": 0.09483693540096283, "rewards/bleu_reward_func/std": 0.08640998601913452, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 155.0, "completions/mean_terminated_length": 131.20001220703125, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5552, "grad_norm": 55.443695068359375, "kl": 0.318450927734375, "learning_rate": 1e-06, "loss": -0.0617, "num_tokens": 9132678.0, "reward": 0.10359849035739899, "reward_std": 0.07937172800302505, "rewards/bleu_reward_func/mean": 0.10359849035739899, "rewards/bleu_reward_func/std": 0.13979652523994446, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 236.65625, "completions/mean_terminated_length": 71.45000457763672, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.556, "grad_norm": 146.36070251464844, "kl": 0.099609375, "learning_rate": 1e-06, "loss": 0.1483, "num_tokens": 9145227.0, "reward": 0.17079538106918335, "reward_std": 0.05914284288883209, "rewards/bleu_reward_func/mean": 0.17079538106918335, "rewards/bleu_reward_func/std": 0.23325958847999573, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 228.5625, "completions/mean_terminated_length": 134.08334350585938, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.5568, "grad_norm": 298.8739013671875, "kl": 0.1568603515625, "learning_rate": 1e-06, "loss": -0.1223, "num_tokens": 9160869.0, "reward": 0.037290386855602264, "reward_std": 0.014398223720490932, "rewards/bleu_reward_func/mean": 0.037290386855602264, "rewards/bleu_reward_func/std": 0.03690984100103378, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 229.3125, "completions/mean_terminated_length": 164.07693481445312, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.5576, "grad_norm": 220.50381469726562, "kl": 0.365478515625, "learning_rate": 1e-06, "loss": 0.0947, "num_tokens": 9170959.0, "reward": 0.11910027265548706, "reward_std": 0.04049726575613022, "rewards/bleu_reward_func/mean": 0.11910027265548706, "rewards/bleu_reward_func/std": 0.14281374216079712, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 216.15625, "completions/mean_terminated_length": 161.37037658691406, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5584, "grad_norm": 11.570438385009766, "kl": 0.205352783203125, "learning_rate": 1e-06, "loss": 0.0257, "num_tokens": 9181980.0, "reward": 0.08099336922168732, "reward_std": 0.04509742930531502, "rewards/bleu_reward_func/mean": 0.08099336922168732, "rewards/bleu_reward_func/std": 0.10288692265748978, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 349.96875, "completions/mean_terminated_length": 304.6000061035156, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5592, "grad_norm": 37.448883056640625, "kl": 0.09320068359375, "learning_rate": 1e-06, "loss": 0.0422, "num_tokens": 9197723.0, "reward": 0.05653442069888115, "reward_std": 0.026268266141414642, "rewards/bleu_reward_func/mean": 0.05653442069888115, "rewards/bleu_reward_func/std": 0.04277388006448746, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 255.03125, "completions/mean_terminated_length": 195.73077392578125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.56, "grad_norm": 340.22479248046875, "kl": 0.41278076171875, "learning_rate": 1e-06, "loss": -0.2392, "num_tokens": 9210148.0, "reward": 0.06149371713399887, "reward_std": 0.023687850683927536, "rewards/bleu_reward_func/mean": 0.06149371713399887, "rewards/bleu_reward_func/std": 0.03754807263612747, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 282.84375, "completions/mean_terminated_length": 145.35000610351562, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.5608, "grad_norm": 404.3658752441406, "kl": 0.29132080078125, "learning_rate": 1e-06, "loss": 0.148, "num_tokens": 9222815.0, "reward": 0.05664993077516556, "reward_std": 0.01803937554359436, "rewards/bleu_reward_func/mean": 0.05664993077516556, "rewards/bleu_reward_func/std": 0.02324024587869644, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 116.40625, "completions/mean_terminated_length": 59.892860412597656, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.5616, "grad_norm": 43.716766357421875, "kl": 0.216552734375, "learning_rate": 1e-06, "loss": -0.0484, "num_tokens": 9231460.0, "reward": 0.1364556849002838, "reward_std": 0.09380181133747101, "rewards/bleu_reward_func/mean": 0.1364556849002838, "rewards/bleu_reward_func/std": 0.21690192818641663, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 305.65625, "completions/mean_terminated_length": 211.8636474609375, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.5624, "grad_norm": 745.7041625976562, "kl": 0.14068603515625, "learning_rate": 1e-06, "loss": 0.1945, "num_tokens": 9245265.0, "reward": 0.04224216938018799, "reward_std": 0.016487902030348778, "rewards/bleu_reward_func/mean": 0.04224216938018799, "rewards/bleu_reward_func/std": 0.025008555501699448, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 115.0625, "completions/mean_terminated_length": 102.25806427001953, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.5632, "grad_norm": 216.922607421875, "kl": 0.3056640625, "learning_rate": 1e-06, "loss": 0.0737, "num_tokens": 9253899.0, "reward": 0.08648309111595154, "reward_std": 0.03777506947517395, "rewards/bleu_reward_func/mean": 0.08648309111595154, "rewards/bleu_reward_func/std": 0.05951961874961853, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 253.71875, "completions/mean_terminated_length": 98.75, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.564, "grad_norm": 162.15737915039062, "kl": 0.1361083984375, "learning_rate": 1e-06, "loss": -0.0623, "num_tokens": 9269314.0, "reward": 0.2322496771812439, "reward_std": 0.045732706785202026, "rewards/bleu_reward_func/mean": 0.2322496771812439, "rewards/bleu_reward_func/std": 0.25273510813713074, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 340.0625, "completions/mean_terminated_length": 272.7826232910156, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.5648, "grad_norm": 4.662173271179199, "kl": 0.035003662109375, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 9282188.0, "reward": 0.04237870126962662, "reward_std": 0.01780301332473755, "rewards/bleu_reward_func/mean": 0.04237870126962662, "rewards/bleu_reward_func/std": 0.04967799782752991, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 349.6875, "completions/mean_terminated_length": 275.9090881347656, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.5656, "grad_norm": 2.4642882347106934, "kl": 0.04644775390625, "learning_rate": 1e-06, "loss": 0.0446, "num_tokens": 9298362.0, "reward": 0.0549091175198555, "reward_std": 0.03525715321302414, "rewards/bleu_reward_func/mean": 0.0549091175198555, "rewards/bleu_reward_func/std": 0.051221489906311035, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 232.40000915527344, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.5664, "grad_norm": 11.45730972290039, "kl": 0.16558837890625, "learning_rate": 1e-06, "loss": 0.0823, "num_tokens": 9308934.0, "reward": 0.10365931689739227, "reward_std": 0.028398117050528526, "rewards/bleu_reward_func/mean": 0.10365931689739227, "rewards/bleu_reward_func/std": 0.06428122520446777, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 254.0625, "completions/mean_terminated_length": 99.30000305175781, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.5672, "grad_norm": 7.227019786834717, "kl": 0.154449462890625, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 9322640.0, "reward": 0.18866762518882751, "reward_std": 0.044271718710660934, "rewards/bleu_reward_func/mean": 0.18866762518882751, "rewards/bleu_reward_func/std": 0.1287185698747635, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 304.96875, "completions/mean_terminated_length": 283.5517272949219, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.568, "grad_norm": 5.722259998321533, "kl": 0.07122802734375, "learning_rate": 1e-06, "loss": -0.1128, "num_tokens": 9337639.0, "reward": 0.0695868507027626, "reward_std": 0.022387558594346046, "rewards/bleu_reward_func/mean": 0.0695868507027626, "rewards/bleu_reward_func/std": 0.065777987241745, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 150.09375, "completions/mean_terminated_length": 125.9666748046875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.5688, "grad_norm": 5.365989685058594, "kl": 0.0738525390625, "learning_rate": 1e-06, "loss": 0.3885, "num_tokens": 9346522.0, "reward": 0.2348867505788803, "reward_std": 0.09850712865591049, "rewards/bleu_reward_func/mean": 0.2348867505788803, "rewards/bleu_reward_func/std": 0.302653044462204, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 342.65625, "completions/mean_terminated_length": 265.68182373046875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5696, "grad_norm": 7.032015800476074, "kl": 0.060089111328125, "learning_rate": 1e-06, "loss": -0.0338, "num_tokens": 9359935.0, "reward": 0.053069278597831726, "reward_std": 0.015607406385242939, "rewards/bleu_reward_func/mean": 0.053069278597831726, "rewards/bleu_reward_func/std": 0.0380670465528965, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 166.46875, "completions/mean_terminated_length": 69.72000122070312, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.5704, "grad_norm": 7.543166160583496, "kl": 0.26068115234375, "learning_rate": 1e-06, "loss": 0.3241, "num_tokens": 9367886.0, "reward": 0.17086198925971985, "reward_std": 0.059704020619392395, "rewards/bleu_reward_func/mean": 0.17086198925971985, "rewards/bleu_reward_func/std": 0.13924144208431244, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 306.3125, "completions/mean_terminated_length": 248.72000122070312, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.5712, "grad_norm": 3.74664568901062, "kl": 0.086090087890625, "learning_rate": 1e-06, "loss": -0.2145, "num_tokens": 9381256.0, "reward": 0.026702141389250755, "reward_std": 0.010159555822610855, "rewards/bleu_reward_func/mean": 0.026702141389250755, "rewards/bleu_reward_func/std": 0.025492098182439804, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 202.40625, "completions/mean_terminated_length": 145.07408142089844, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.572, "grad_norm": 3.670616865158081, "kl": 0.0810394287109375, "learning_rate": 1e-06, "loss": 0.3116, "num_tokens": 9394869.0, "reward": 0.09095513820648193, "reward_std": 0.0500517264008522, "rewards/bleu_reward_func/mean": 0.09095513820648193, "rewards/bleu_reward_func/std": 0.07213761657476425, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 221.5625, "completions/mean_terminated_length": 202.20001220703125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.5728, "grad_norm": 9.221464157104492, "kl": 0.10400390625, "learning_rate": 1e-06, "loss": 0.3058, "num_tokens": 9405335.0, "reward": 0.12374541163444519, "reward_std": 0.040764160454273224, "rewards/bleu_reward_func/mean": 0.12374541163444519, "rewards/bleu_reward_func/std": 0.13386160135269165, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 347.1875, "completions/mean_terminated_length": 234.42105102539062, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.5736, "grad_norm": 12.991151809692383, "kl": 0.157958984375, "learning_rate": 1e-06, "loss": 0.0424, "num_tokens": 9420789.0, "reward": 0.07029742747545242, "reward_std": 0.012854170054197311, "rewards/bleu_reward_func/mean": 0.07029742747545242, "rewards/bleu_reward_func/std": 0.041719451546669006, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 503.75, "completions/mean_terminated_length": 474.2857360839844, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "epoch": 0.5744, "grad_norm": 2.107853889465332, "kl": 0.028350830078125, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 9439789.0, "reward": 0.05256051570177078, "reward_std": 0.010154004208743572, "rewards/bleu_reward_func/mean": 0.05256051570177078, "rewards/bleu_reward_func/std": 0.03522626310586929, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 244.15625, "completions/mean_terminated_length": 235.51612854003906, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.5752, "grad_norm": 3.158909559249878, "kl": 0.0923919677734375, "learning_rate": 1e-06, "loss": -0.1019, "num_tokens": 9451978.0, "reward": 0.12841522693634033, "reward_std": 0.05657704174518585, "rewards/bleu_reward_func/mean": 0.12841522693634033, "rewards/bleu_reward_func/std": 0.07523242384195328, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 150.4375, "completions/mean_terminated_length": 83.48148345947266, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.576, "grad_norm": 8.099937438964844, "kl": 0.214874267578125, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 9461736.0, "reward": 0.08868992328643799, "reward_std": 0.017071515321731567, "rewards/bleu_reward_func/mean": 0.08868992328643799, "rewards/bleu_reward_func/std": 0.08577441424131393, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 447.65625, "completions/mean_terminated_length": 383.3125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.5768, "grad_norm": 2.4854142665863037, "kl": 0.045318603515625, "learning_rate": 1e-06, "loss": -0.0504, "num_tokens": 9477381.0, "reward": 0.0682106539607048, "reward_std": 0.022257793694734573, "rewards/bleu_reward_func/mean": 0.0682106539607048, "rewards/bleu_reward_func/std": 0.05095710977911949, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 167.84375, "completions/mean_terminated_length": 156.74192810058594, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.5776, "grad_norm": 4.308876037597656, "kl": 0.07659912109375, "learning_rate": 1e-06, "loss": 0.1774, "num_tokens": 9489008.0, "reward": 0.0951995924115181, "reward_std": 0.033833228051662445, "rewards/bleu_reward_func/mean": 0.0951995924115181, "rewards/bleu_reward_func/std": 0.0729941874742508, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 179.34375, "completions/mean_terminated_length": 144.9310302734375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.5784, "grad_norm": 12.132597923278809, "kl": 0.33624267578125, "learning_rate": 1e-06, "loss": 0.2153, "num_tokens": 9498059.0, "reward": 0.10619839280843735, "reward_std": 0.04761648178100586, "rewards/bleu_reward_func/mean": 0.10619839280843735, "rewards/bleu_reward_func/std": 0.0809776559472084, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 374.78125, "completions/mean_terminated_length": 268.0555725097656, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5792, "grad_norm": 7.476963996887207, "kl": 0.1087646484375, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 9517972.0, "reward": 0.17816489934921265, "reward_std": 0.016055870801210403, "rewards/bleu_reward_func/mean": 0.17816489934921265, "rewards/bleu_reward_func/std": 0.266427606344223, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 152.7692413330078, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.58, "grad_norm": 5.399383544921875, "kl": 0.13983154296875, "learning_rate": 1e-06, "loss": 0.1212, "num_tokens": 9530760.0, "reward": 0.07149016857147217, "reward_std": 0.023819390684366226, "rewards/bleu_reward_func/mean": 0.07149016857147217, "rewards/bleu_reward_func/std": 0.053011875599622726, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 500.3125, "completions/mean_terminated_length": 418.5, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.5808, "grad_norm": 2.243359088897705, "kl": 0.028045654296875, "learning_rate": 1e-06, "loss": -0.0175, "num_tokens": 9550330.0, "reward": 0.04440176486968994, "reward_std": 0.00922885537147522, "rewards/bleu_reward_func/mean": 0.04440176486968994, "rewards/bleu_reward_func/std": 0.03932040557265282, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 154.65625, "completions/mean_terminated_length": 103.60714721679688, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5816, "grad_norm": 54.60096740722656, "kl": 0.27252197265625, "learning_rate": 1e-06, "loss": 0.1714, "num_tokens": 9557863.0, "reward": 0.34987902641296387, "reward_std": 0.09637948125600815, "rewards/bleu_reward_func/mean": 0.34987902641296387, "rewards/bleu_reward_func/std": 0.30998000502586365, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 394.21875, "completions/mean_terminated_length": 197.9166717529297, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.5824, "grad_norm": 3.5557267665863037, "kl": 0.04107666015625, "learning_rate": 1e-06, "loss": -0.2035, "num_tokens": 9576342.0, "reward": 0.03708350285887718, "reward_std": 0.013400746509432793, "rewards/bleu_reward_func/mean": 0.03708350285887718, "rewards/bleu_reward_func/std": 0.030460968613624573, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 287.53125, "completions/mean_terminated_length": 272.5666809082031, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.5832, "grad_norm": 5.337757110595703, "kl": 0.106048583984375, "learning_rate": 1e-06, "loss": -0.0648, "num_tokens": 9587719.0, "reward": 0.08226186782121658, "reward_std": 0.016267672181129456, "rewards/bleu_reward_func/mean": 0.08226186782121658, "rewards/bleu_reward_func/std": 0.047058336436748505, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 353.46875, "completions/mean_terminated_length": 291.4347839355469, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.584, "grad_norm": 172.38458251953125, "kl": 0.142059326171875, "learning_rate": 1e-06, "loss": -0.0203, "num_tokens": 9604270.0, "reward": 0.08777523040771484, "reward_std": 0.028989041224122047, "rewards/bleu_reward_func/mean": 0.08777523040771484, "rewards/bleu_reward_func/std": 0.053535155951976776, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 123.5625, "completions/mean_terminated_length": 123.5625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.5848, "grad_norm": 14.68234920501709, "kl": 0.1837158203125, "learning_rate": 1e-06, "loss": 0.053, "num_tokens": 9613376.0, "reward": 0.09781108796596527, "reward_std": 0.03509049117565155, "rewards/bleu_reward_func/mean": 0.09781108796596527, "rewards/bleu_reward_func/std": 0.07531887292861938, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 147.53125, "completions/mean_terminated_length": 63.42308044433594, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.5856, "grad_norm": 471.59912109375, "kl": 0.191680908203125, "learning_rate": 1e-06, "loss": 0.0896, "num_tokens": 9624305.0, "reward": 0.08778894692659378, "reward_std": 0.025603748857975006, "rewards/bleu_reward_func/mean": 0.08778894692659378, "rewards/bleu_reward_func/std": 0.06823020428419113, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 254.0, "completions/mean_terminated_length": 194.4615478515625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.5864, "grad_norm": 3.397786855697632, "kl": 0.03460693359375, "learning_rate": 1e-06, "loss": -0.0596, "num_tokens": 9634593.0, "reward": 0.08798034489154816, "reward_std": 0.02149152383208275, "rewards/bleu_reward_func/mean": 0.08798034489154816, "rewards/bleu_reward_func/std": 0.07060196995735168, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 300.75, "completions/mean_terminated_length": 261.629638671875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.5872, "grad_norm": 11.675055503845215, "kl": 0.0538330078125, "learning_rate": 1e-06, "loss": 0.1186, "num_tokens": 9647729.0, "reward": 0.20255795121192932, "reward_std": 0.044919952750205994, "rewards/bleu_reward_func/mean": 0.20255795121192932, "rewards/bleu_reward_func/std": 0.23513151705265045, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 100.875, "completions/mean_terminated_length": 100.875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.588, "grad_norm": 7.969452381134033, "kl": 0.2791748046875, "learning_rate": 1e-06, "loss": 0.1783, "num_tokens": 9655565.0, "reward": 0.1732563078403473, "reward_std": 0.06255275756120682, "rewards/bleu_reward_func/mean": 0.1732563078403473, "rewards/bleu_reward_func/std": 0.14761896431446075, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 255.78125, "completions/mean_terminated_length": 196.6538543701172, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.5888, "grad_norm": 37.51567840576172, "kl": 0.139739990234375, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 9672070.0, "reward": 0.3068300187587738, "reward_std": 0.018469596281647682, "rewards/bleu_reward_func/mean": 0.3068300187587738, "rewards/bleu_reward_func/std": 0.29021966457366943, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 367.15625, "completions/mean_terminated_length": 301.31817626953125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5896, "grad_norm": 2.4598867893218994, "kl": 0.0330657958984375, "learning_rate": 1e-06, "loss": -0.0194, "num_tokens": 9688219.0, "reward": 0.05701170861721039, "reward_std": 0.020281650125980377, "rewards/bleu_reward_func/mean": 0.05701170861721039, "rewards/bleu_reward_func/std": 0.055385053157806396, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 318.46875, "completions/mean_terminated_length": 230.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.5904, "grad_norm": 4.623442649841309, "kl": 0.045806884765625, "learning_rate": 1e-06, "loss": -0.2136, "num_tokens": 9701082.0, "reward": 0.05631488561630249, "reward_std": 0.022235814481973648, "rewards/bleu_reward_func/mean": 0.05631488561630249, "rewards/bleu_reward_func/std": 0.0748782679438591, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 195.40625, "completions/mean_terminated_length": 122.34616088867188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.5912, "grad_norm": 6.426390171051025, "kl": 0.21075439453125, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 9712175.0, "reward": 0.250314861536026, "reward_std": 0.043683700263500214, "rewards/bleu_reward_func/mean": 0.250314861536026, "rewards/bleu_reward_func/std": 0.27451202273368835, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 314.375, "completions/mean_terminated_length": 195.8000030517578, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.592, "grad_norm": 4.511030197143555, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0376, "num_tokens": 9727491.0, "reward": 0.24225017428398132, "reward_std": 0.0391615591943264, "rewards/bleu_reward_func/mean": 0.24225017428398132, "rewards/bleu_reward_func/std": 0.23075063526630402, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 291.34375, "completions/mean_terminated_length": 191.0454559326172, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.5928, "grad_norm": 5.867874622344971, "kl": 0.070037841796875, "learning_rate": 1e-06, "loss": 0.0656, "num_tokens": 9740590.0, "reward": 0.08597154170274734, "reward_std": 0.053836189210414886, "rewards/bleu_reward_func/mean": 0.08597154170274734, "rewards/bleu_reward_func/std": 0.12926995754241943, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 205.34375, "completions/mean_terminated_length": 148.55555725097656, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.5936, "grad_norm": 6.713605880737305, "kl": 0.12249755859375, "learning_rate": 1e-06, "loss": 0.1915, "num_tokens": 9751033.0, "reward": 0.19744500517845154, "reward_std": 0.041014768183231354, "rewards/bleu_reward_func/mean": 0.19744500517845154, "rewards/bleu_reward_func/std": 0.1538456529378891, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 249.46875, "completions/mean_terminated_length": 200.8518524169922, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.5944, "grad_norm": 5.988234043121338, "kl": 0.22430419921875, "learning_rate": 1e-06, "loss": 0.0382, "num_tokens": 9764368.0, "reward": 0.2869833707809448, "reward_std": 0.07026369869709015, "rewards/bleu_reward_func/mean": 0.2869833707809448, "rewards/bleu_reward_func/std": 0.2287815362215042, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 265.3125, "completions/mean_terminated_length": 196.239990234375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.5952, "grad_norm": 8.931143760681152, "kl": 0.234375, "learning_rate": 1e-06, "loss": -0.1984, "num_tokens": 9774882.0, "reward": 0.07221800833940506, "reward_std": 0.031748898327350616, "rewards/bleu_reward_func/mean": 0.07221800833940506, "rewards/bleu_reward_func/std": 0.06019110977649689, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 410.3125, "completions/mean_terminated_length": 279.5714416503906, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.596, "grad_norm": 2.886622190475464, "kl": 0.031707763671875, "learning_rate": 1e-06, "loss": -0.0903, "num_tokens": 9790372.0, "reward": 0.030976204201579094, "reward_std": 0.015047797001898289, "rewards/bleu_reward_func/mean": 0.030976204201579094, "rewards/bleu_reward_func/std": 0.033486902713775635, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 241.84375, "completions/mean_terminated_length": 166.1999969482422, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.5968, "grad_norm": 4.601830005645752, "kl": 0.069915771484375, "learning_rate": 1e-06, "loss": 0.0366, "num_tokens": 9801439.0, "reward": 0.10595028847455978, "reward_std": 0.024186890572309494, "rewards/bleu_reward_func/mean": 0.10595028847455978, "rewards/bleu_reward_func/std": 0.10705985873937607, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 311.53125, "completions/mean_terminated_length": 255.39999389648438, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.5976, "grad_norm": 2.84419322013855, "kl": 0.05322265625, "learning_rate": 1e-06, "loss": -0.0815, "num_tokens": 9816096.0, "reward": 0.02818489633500576, "reward_std": 0.008401873521506786, "rewards/bleu_reward_func/mean": 0.02818489633500576, "rewards/bleu_reward_func/std": 0.02303573302924633, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 100.78125, "completions/mean_terminated_length": 100.78125, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.5984, "grad_norm": 8.280403137207031, "kl": 0.14306640625, "learning_rate": 1e-06, "loss": 0.0722, "num_tokens": 9821761.0, "reward": 0.07066097855567932, "reward_std": 0.026687482371926308, "rewards/bleu_reward_func/mean": 0.07066097855567932, "rewards/bleu_reward_func/std": 0.04903886467218399, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 129.1875, "completions/mean_terminated_length": 129.1875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.5992, "grad_norm": 6.650774955749512, "kl": 0.1456298828125, "learning_rate": 1e-06, "loss": 0.1458, "num_tokens": 9829943.0, "reward": 0.19511400163173676, "reward_std": 0.03503159433603287, "rewards/bleu_reward_func/mean": 0.19511400163173676, "rewards/bleu_reward_func/std": 0.21101784706115723, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 258.1875, "completions/mean_terminated_length": 105.9000015258789, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.6, "grad_norm": 5.664450645446777, "kl": 0.081695556640625, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 9841813.0, "reward": 0.20738312602043152, "reward_std": 0.07332950830459595, "rewards/bleu_reward_func/mean": 0.20738312602043152, "rewards/bleu_reward_func/std": 0.2197185456752777, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 160.03125, "completions/mean_terminated_length": 136.56668090820312, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6008, "grad_norm": 69.39850616455078, "kl": 0.24407958984375, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 9852910.0, "reward": 0.2434358447790146, "reward_std": 0.10366295278072357, "rewards/bleu_reward_func/mean": 0.2434358447790146, "rewards/bleu_reward_func/std": 0.18655826151371002, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 316.875, "completions/mean_terminated_length": 251.83334350585938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.6016, "grad_norm": 4.908503532409668, "kl": 0.0460205078125, "learning_rate": 1e-06, "loss": 0.1812, "num_tokens": 9865266.0, "reward": 0.05490465834736824, "reward_std": 0.02047915570437908, "rewards/bleu_reward_func/mean": 0.05490465834736824, "rewards/bleu_reward_func/std": 0.04037528112530708, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 269.09375, "completions/mean_terminated_length": 252.90000915527344, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6024, "grad_norm": 2.893157958984375, "kl": 0.048187255859375, "learning_rate": 1e-06, "loss": -0.0419, "num_tokens": 9876613.0, "reward": 0.05797014757990837, "reward_std": 0.029720589518547058, "rewards/bleu_reward_func/mean": 0.05797014757990837, "rewards/bleu_reward_func/std": 0.07483170926570892, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 434.21875, "completions/mean_terminated_length": 285.727294921875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6032, "grad_norm": 3.0334763526916504, "kl": 0.042449951171875, "learning_rate": 1e-06, "loss": -0.167, "num_tokens": 9894044.0, "reward": 0.0762338861823082, "reward_std": 0.02360478974878788, "rewards/bleu_reward_func/mean": 0.0762338861823082, "rewards/bleu_reward_func/std": 0.0673457533121109, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 423.21875, "completions/mean_terminated_length": 309.0714416503906, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.604, "grad_norm": 2.2641522884368896, "kl": 0.03399658203125, "learning_rate": 1e-06, "loss": -0.0917, "num_tokens": 9911107.0, "reward": 0.03881996497511864, "reward_std": 0.012424922548234463, "rewards/bleu_reward_func/mean": 0.03881996497511864, "rewards/bleu_reward_func/std": 0.02825937233865261, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 196.53125, "completions/mean_terminated_length": 108.19999694824219, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6048, "grad_norm": 9.265530586242676, "kl": 0.2891845703125, "learning_rate": 1e-06, "loss": 0.1189, "num_tokens": 9924868.0, "reward": 0.2168477475643158, "reward_std": 0.07323689758777618, "rewards/bleu_reward_func/mean": 0.2168477475643158, "rewards/bleu_reward_func/std": 0.17768503725528717, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 163.53125, "completions/mean_terminated_length": 152.29031372070312, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.6056, "grad_norm": 44.89513397216797, "kl": 0.31927490234375, "learning_rate": 1e-06, "loss": 0.0321, "num_tokens": 9935765.0, "reward": 0.08691335469484329, "reward_std": 0.03311008960008621, "rewards/bleu_reward_func/mean": 0.08691335469484329, "rewards/bleu_reward_func/std": 0.08314234763383865, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 97.375, "completions/mean_terminated_length": 97.375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6064, "grad_norm": 8.500920295715332, "kl": 0.15838623046875, "learning_rate": 1e-06, "loss": 0.1392, "num_tokens": 9944529.0, "reward": 0.25747808814048767, "reward_std": 0.048998236656188965, "rewards/bleu_reward_func/mean": 0.25747808814048767, "rewards/bleu_reward_func/std": 0.22997993230819702, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 357.78125, "completions/mean_terminated_length": 265.25, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.6072, "grad_norm": 3.4296295642852783, "kl": 0.05389404296875, "learning_rate": 1e-06, "loss": -0.0162, "num_tokens": 9961058.0, "reward": 0.07074315845966339, "reward_std": 0.0662379041314125, "rewards/bleu_reward_func/mean": 0.07074315845966339, "rewards/bleu_reward_func/std": 0.1079547107219696, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 175.86207580566406, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.608, "grad_norm": 6.224173545837402, "kl": 0.1185302734375, "learning_rate": 1e-06, "loss": 0.0424, "num_tokens": 9972646.0, "reward": 0.09982403367757797, "reward_std": 0.06204840913414955, "rewards/bleu_reward_func/mean": 0.09982403367757797, "rewards/bleu_reward_func/std": 0.10973682999610901, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 152.8125, "completions/mean_terminated_length": 128.86666870117188, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.6088, "grad_norm": 12.398276329040527, "kl": 0.240966796875, "learning_rate": 1e-06, "loss": 0.198, "num_tokens": 9985136.0, "reward": 0.059197958558797836, "reward_std": 0.02872345596551895, "rewards/bleu_reward_func/mean": 0.059197958558797836, "rewards/bleu_reward_func/std": 0.04119112715125084, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 377.6875, "completions/mean_terminated_length": 273.22222900390625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.6096, "grad_norm": 3.9670443534851074, "kl": 0.046600341796875, "learning_rate": 1e-06, "loss": 0.0648, "num_tokens": 10003078.0, "reward": 0.0413321927189827, "reward_std": 0.015110660344362259, "rewards/bleu_reward_func/mean": 0.0413321927189827, "rewards/bleu_reward_func/std": 0.032528944313526154, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 472.0, "completions/mean_terminated_length": 413.5384826660156, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.6104, "grad_norm": 2.092273712158203, "kl": 0.0268096923828125, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 10020230.0, "reward": 0.08549900352954865, "reward_std": 0.03175706788897514, "rewards/bleu_reward_func/mean": 0.08549900352954865, "rewards/bleu_reward_func/std": 0.042792484164237976, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 150.6875, "completions/mean_terminated_length": 83.77777862548828, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6112, "grad_norm": 9.28490924835205, "kl": 0.265869140625, "learning_rate": 1e-06, "loss": 0.2918, "num_tokens": 10031172.0, "reward": 0.21292155981063843, "reward_std": 0.06925603747367859, "rewards/bleu_reward_func/mean": 0.21292155981063843, "rewards/bleu_reward_func/std": 0.18994402885437012, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 328.3125, "completions/mean_terminated_length": 218.10000610351562, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.612, "grad_norm": 21.738191604614258, "kl": 0.1495361328125, "learning_rate": 1e-06, "loss": -0.0895, "num_tokens": 10046358.0, "reward": 0.09622834622859955, "reward_std": 0.04176661744713783, "rewards/bleu_reward_func/mean": 0.09622834622859955, "rewards/bleu_reward_func/std": 0.09296616911888123, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 288.21875, "completions/mean_terminated_length": 246.7777862548828, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.6128, "grad_norm": 5.178511142730713, "kl": 0.1473388671875, "learning_rate": 1e-06, "loss": 0.0688, "num_tokens": 10059877.0, "reward": 0.088385209441185, "reward_std": 0.016962474212050438, "rewards/bleu_reward_func/mean": 0.088385209441185, "rewards/bleu_reward_func/std": 0.08985943347215652, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 311.46875, "completions/mean_terminated_length": 244.625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.6136, "grad_norm": 11.883882522583008, "kl": 0.2354583740234375, "learning_rate": 1e-06, "loss": -0.0238, "num_tokens": 10073076.0, "reward": 0.18446165323257446, "reward_std": 0.10309243947267532, "rewards/bleu_reward_func/mean": 0.18446165323257446, "rewards/bleu_reward_func/std": 0.30338525772094727, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 275.59375, "completions/mean_terminated_length": 168.13636779785156, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6144, "grad_norm": 5.438653469085693, "kl": 0.081451416015625, "learning_rate": 1e-06, "loss": 0.0639, "num_tokens": 10085871.0, "reward": 0.06602545082569122, "reward_std": 0.030349329113960266, "rewards/bleu_reward_func/mean": 0.06602545082569122, "rewards/bleu_reward_func/std": 0.04767395555973053, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 136.09375, "completions/mean_terminated_length": 136.09375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6152, "grad_norm": 5.674292087554932, "kl": 0.26715087890625, "learning_rate": 1e-06, "loss": 0.2437, "num_tokens": 10094858.0, "reward": 0.07597756385803223, "reward_std": 0.027629435062408447, "rewards/bleu_reward_func/mean": 0.07597756385803223, "rewards/bleu_reward_func/std": 0.054181892424821854, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 107.9375, "completions/mean_terminated_length": 94.9032211303711, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.616, "grad_norm": 9.938776969909668, "kl": 0.16571044921875, "learning_rate": 1e-06, "loss": 0.082, "num_tokens": 10106672.0, "reward": 0.3735446035861969, "reward_std": 0.03898521885275841, "rewards/bleu_reward_func/mean": 0.3735446035861969, "rewards/bleu_reward_func/std": 0.30306297540664673, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 338.6875, "completions/mean_terminated_length": 280.91668701171875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.6168, "grad_norm": 3.914602041244507, "kl": 0.05010986328125, "learning_rate": 1e-06, "loss": -0.1112, "num_tokens": 10120510.0, "reward": 0.09634806215763092, "reward_std": 0.04157658666372299, "rewards/bleu_reward_func/mean": 0.09634806215763092, "rewards/bleu_reward_func/std": 0.08702099323272705, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 215.23077392578125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6176, "grad_norm": 5.461522102355957, "kl": 0.3319091796875, "learning_rate": 1e-06, "loss": 0.0344, "num_tokens": 10132138.0, "reward": 0.18050828576087952, "reward_std": 0.033299222588539124, "rewards/bleu_reward_func/mean": 0.18050828576087952, "rewards/bleu_reward_func/std": 0.21068614721298218, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 131.375, "completions/mean_terminated_length": 131.375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6184, "grad_norm": 8.162845611572266, "kl": 0.2459716796875, "learning_rate": 1e-06, "loss": 0.1064, "num_tokens": 10142558.0, "reward": 0.1030242070555687, "reward_std": 0.05847536772489548, "rewards/bleu_reward_func/mean": 0.1030242070555687, "rewards/bleu_reward_func/std": 0.14961844682693481, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 169.9375, "completions/mean_terminated_length": 169.9375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6192, "grad_norm": 43.1834602355957, "kl": 0.22735595703125, "learning_rate": 1e-06, "loss": -0.0649, "num_tokens": 10151012.0, "reward": 0.04478512331843376, "reward_std": 0.012456279247999191, "rewards/bleu_reward_func/mean": 0.04478512331843376, "rewards/bleu_reward_func/std": 0.04301442950963974, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 249.3125, "completions/mean_terminated_length": 188.69232177734375, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.62, "grad_norm": 8.241740226745605, "kl": 0.188079833984375, "learning_rate": 1e-06, "loss": 0.1066, "num_tokens": 10163446.0, "reward": 0.15149368345737457, "reward_std": 0.028546612709760666, "rewards/bleu_reward_func/mean": 0.15149368345737457, "rewards/bleu_reward_func/std": 0.14032159745693207, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 235.125, "completions/mean_terminated_length": 126.78260803222656, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6208, "grad_norm": 8.033968925476074, "kl": 0.22894287109375, "learning_rate": 1e-06, "loss": 0.098, "num_tokens": 10174522.0, "reward": 0.22483249008655548, "reward_std": 0.04489654302597046, "rewards/bleu_reward_func/mean": 0.22483249008655548, "rewards/bleu_reward_func/std": 0.24184906482696533, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 273.0625, "completions/mean_terminated_length": 179.56521606445312, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6216, "grad_norm": 6.315563201904297, "kl": 0.09613037109375, "learning_rate": 1e-06, "loss": 0.1547, "num_tokens": 10188964.0, "reward": 0.06417440623044968, "reward_std": 0.01652311347424984, "rewards/bleu_reward_func/mean": 0.06417440623044968, "rewards/bleu_reward_func/std": 0.05222758278250694, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 350.40625, "completions/mean_terminated_length": 188.8125, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6224, "grad_norm": 4.945059776306152, "kl": 0.08636474609375, "learning_rate": 1e-06, "loss": 0.0638, "num_tokens": 10203657.0, "reward": 0.07400526106357574, "reward_std": 0.03621644526720047, "rewards/bleu_reward_func/mean": 0.07400526106357574, "rewards/bleu_reward_func/std": 0.05740804970264435, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 319.1875, "completions/mean_terminated_length": 243.7391357421875, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.6232, "grad_norm": 4.364378929138184, "kl": 0.07342529296875, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 10219919.0, "reward": 0.24302205443382263, "reward_std": 0.05576051399111748, "rewards/bleu_reward_func/mean": 0.24302205443382263, "rewards/bleu_reward_func/std": 0.21575042605400085, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 201.8181915283203, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.624, "grad_norm": 3.959169864654541, "kl": 0.06689453125, "learning_rate": 1e-06, "loss": 0.0405, "num_tokens": 10234119.0, "reward": 0.09677430242300034, "reward_std": 0.02671782858669758, "rewards/bleu_reward_func/mean": 0.09677430242300034, "rewards/bleu_reward_func/std": 0.06890682131052017, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 211.40625, "completions/mean_terminated_length": 168.46429443359375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.6248, "grad_norm": 10.254522323608398, "kl": 0.30865478515625, "learning_rate": 1e-06, "loss": -0.044, "num_tokens": 10247388.0, "reward": 0.2194344401359558, "reward_std": 0.04920031875371933, "rewards/bleu_reward_func/mean": 0.2194344401359558, "rewards/bleu_reward_func/std": 0.15552020072937012, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 201.46875, "completions/mean_terminated_length": 180.7666778564453, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.6256, "grad_norm": 7.234709739685059, "kl": 0.1651611328125, "learning_rate": 1e-06, "loss": 0.2297, "num_tokens": 10256947.0, "reward": 0.11007180064916611, "reward_std": 0.07193183898925781, "rewards/bleu_reward_func/mean": 0.11007180064916611, "rewards/bleu_reward_func/std": 0.13098347187042236, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 288.15625, "completions/mean_terminated_length": 246.70370483398438, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.6264, "grad_norm": 3.6756701469421387, "kl": 0.050689697265625, "learning_rate": 1e-06, "loss": -0.2297, "num_tokens": 10271504.0, "reward": 0.07084184139966965, "reward_std": 0.03263479843735695, "rewards/bleu_reward_func/mean": 0.07084184139966965, "rewards/bleu_reward_func/std": 0.07953313738107681, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 127.375, "completions/mean_terminated_length": 114.96773529052734, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6272, "grad_norm": 7.3130879402160645, "kl": 0.143096923828125, "learning_rate": 1e-06, "loss": -0.1119, "num_tokens": 10281236.0, "reward": 0.17116650938987732, "reward_std": 0.040961284190416336, "rewards/bleu_reward_func/mean": 0.17116650938987732, "rewards/bleu_reward_func/std": 0.16110415756702423, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 140.0, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.628, "grad_norm": 5.703468322753906, "kl": 0.168975830078125, "learning_rate": 1e-06, "loss": 0.0673, "num_tokens": 10293288.0, "reward": 0.14155232906341553, "reward_std": 0.059418316930532455, "rewards/bleu_reward_func/mean": 0.14155232906341553, "rewards/bleu_reward_func/std": 0.142944797873497, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 266.4615478515625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.6288, "grad_norm": 2.4137492179870605, "kl": 0.023956298828125, "learning_rate": 1e-06, "loss": 0.0405, "num_tokens": 10309744.0, "reward": 0.16134724020957947, "reward_std": 0.01978662982583046, "rewards/bleu_reward_func/mean": 0.16134724020957947, "rewards/bleu_reward_func/std": 0.16176313161849976, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 281.34375, "completions/mean_terminated_length": 228.11538696289062, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.6296, "grad_norm": 5.432137489318848, "kl": 0.126953125, "learning_rate": 1e-06, "loss": -0.0834, "num_tokens": 10322867.0, "reward": 0.13262051343917847, "reward_std": 0.054468683898448944, "rewards/bleu_reward_func/mean": 0.13262051343917847, "rewards/bleu_reward_func/std": 0.1454581618309021, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 157.8125, "completions/mean_terminated_length": 121.17241668701172, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.6304, "grad_norm": 8.820817947387695, "kl": 0.384033203125, "learning_rate": 1e-06, "loss": 0.1057, "num_tokens": 10330085.0, "reward": 0.14398705959320068, "reward_std": 0.05267474800348282, "rewards/bleu_reward_func/mean": 0.14398705959320068, "rewards/bleu_reward_func/std": 0.12204661965370178, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 313.875, "completions/mean_terminated_length": 223.8181915283203, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.6312, "grad_norm": 6.252136707305908, "kl": 0.202178955078125, "learning_rate": 1e-06, "loss": -0.0958, "num_tokens": 10344937.0, "reward": 0.08566081523895264, "reward_std": 0.0418044775724411, "rewards/bleu_reward_func/mean": 0.08566081523895264, "rewards/bleu_reward_func/std": 0.1277945637702942, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 476.40625, "completions/mean_terminated_length": 408.4545593261719, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.632, "grad_norm": 2.1677629947662354, "kl": 0.03436279296875, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 10362222.0, "reward": 0.04695405811071396, "reward_std": 0.013839447870850563, "rewards/bleu_reward_func/mean": 0.04695405811071396, "rewards/bleu_reward_func/std": 0.03280064836144447, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 404.46875, "completions/mean_terminated_length": 296.9375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.6328, "grad_norm": 2.3538496494293213, "kl": 0.0289459228515625, "learning_rate": 1e-06, "loss": -0.0297, "num_tokens": 10382845.0, "reward": 0.08459493517875671, "reward_std": 0.029446884989738464, "rewards/bleu_reward_func/mean": 0.08459493517875671, "rewards/bleu_reward_func/std": 0.051741067320108414, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 133.0625, "completions/mean_terminated_length": 107.80000305175781, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6336, "grad_norm": 6.885672092437744, "kl": 0.20733642578125, "learning_rate": 1e-06, "loss": 0.1098, "num_tokens": 10391199.0, "reward": 0.10581733286380768, "reward_std": 0.034825149923563004, "rewards/bleu_reward_func/mean": 0.10581733286380768, "rewards/bleu_reward_func/std": 0.10278832167387009, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 35.25, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.6344, "grad_norm": 8.279248237609863, "kl": 0.2750244140625, "learning_rate": 1e-06, "loss": -0.0444, "num_tokens": 10399789.0, "reward": 0.1634266972541809, "reward_std": 0.029335156083106995, "rewards/bleu_reward_func/mean": 0.1634266972541809, "rewards/bleu_reward_func/std": 0.1743723601102829, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 244.96875, "completions/mean_terminated_length": 123.59091186523438, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6352, "grad_norm": 5.577760219573975, "kl": 0.230865478515625, "learning_rate": 1e-06, "loss": -0.0138, "num_tokens": 10413116.0, "reward": 0.18318259716033936, "reward_std": 0.02782328985631466, "rewards/bleu_reward_func/mean": 0.18318259716033936, "rewards/bleu_reward_func/std": 0.14704957604408264, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 354.78125, "completions/mean_terminated_length": 344.3000183105469, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.636, "grad_norm": 2.591658115386963, "kl": 0.0296630859375, "learning_rate": 1e-06, "loss": -0.0201, "num_tokens": 10426797.0, "reward": 0.06094507500529289, "reward_std": 0.02977069467306137, "rewards/bleu_reward_func/mean": 0.06094507500529289, "rewards/bleu_reward_func/std": 0.03347548097372055, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 189.06668090820312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6368, "grad_norm": 7.372705936431885, "kl": 0.226715087890625, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 10435821.0, "reward": 0.17854920029640198, "reward_std": 0.039038486778736115, "rewards/bleu_reward_func/mean": 0.17854920029640198, "rewards/bleu_reward_func/std": 0.11250942945480347, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 139.90625, "completions/mean_terminated_length": 139.90625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6376, "grad_norm": 7.399951457977295, "kl": 0.2593994140625, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 10444706.0, "reward": 0.20415213704109192, "reward_std": 0.05372469127178192, "rewards/bleu_reward_func/mean": 0.20415213704109192, "rewards/bleu_reward_func/std": 0.15420135855674744, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 253.3125, "completions/mean_terminated_length": 193.61538696289062, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6384, "grad_norm": 5.452202320098877, "kl": 0.207672119140625, "learning_rate": 1e-06, "loss": 0.0658, "num_tokens": 10457300.0, "reward": 0.18789556622505188, "reward_std": 0.06054109334945679, "rewards/bleu_reward_func/mean": 0.18789556622505188, "rewards/bleu_reward_func/std": 0.18226853013038635, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 164.53125, "completions/mean_terminated_length": 100.18518829345703, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6392, "grad_norm": 9.581258773803711, "kl": 0.312286376953125, "learning_rate": 1e-06, "loss": -0.0913, "num_tokens": 10465053.0, "reward": 0.14276297390460968, "reward_std": 0.028537599369883537, "rewards/bleu_reward_func/mean": 0.14276297390460968, "rewards/bleu_reward_func/std": 0.10928227007389069, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 56.59375, "completions/mean_terminated_length": 56.59375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.64, "grad_norm": 7.995264053344727, "kl": 0.2779541015625, "learning_rate": 1e-06, "loss": 0.3759, "num_tokens": 10477608.0, "reward": 0.34325188398361206, "reward_std": 0.07241753488779068, "rewards/bleu_reward_func/mean": 0.34325188398361206, "rewards/bleu_reward_func/std": 0.20597775280475616, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 191.5625, "completions/mean_terminated_length": 117.61538696289062, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.6408, "grad_norm": 4.229004859924316, "kl": 0.0904541015625, "learning_rate": 1e-06, "loss": 0.1193, "num_tokens": 10485490.0, "reward": 0.11370354145765305, "reward_std": 0.061382561922073364, "rewards/bleu_reward_func/mean": 0.11370354145765305, "rewards/bleu_reward_func/std": 0.15154796838760376, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 99.76470947265625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.6416, "grad_norm": 4.936343193054199, "kl": 0.227630615234375, "learning_rate": 1e-06, "loss": 0.0723, "num_tokens": 10497930.0, "reward": 0.15342603623867035, "reward_std": 0.018828846514225006, "rewards/bleu_reward_func/mean": 0.15342603623867035, "rewards/bleu_reward_func/std": 0.22573818266391754, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 206.9375, "completions/mean_terminated_length": 163.35714721679688, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.6424, "grad_norm": 4.5400471687316895, "kl": 0.1302490234375, "learning_rate": 1e-06, "loss": -0.0265, "num_tokens": 10509096.0, "reward": 0.042201556265354156, "reward_std": 0.01641710475087166, "rewards/bleu_reward_func/mean": 0.042201556265354156, "rewards/bleu_reward_func/std": 0.026252396404743195, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 339.8125, "completions/mean_terminated_length": 222.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.6432, "grad_norm": 4.132330417633057, "kl": 0.042144775390625, "learning_rate": 1e-06, "loss": -0.0638, "num_tokens": 10522810.0, "reward": 0.05155924707651138, "reward_std": 0.017338326200842857, "rewards/bleu_reward_func/mean": 0.05155924707651138, "rewards/bleu_reward_func/std": 0.03961692750453949, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 251.0, "completions/mean_terminated_length": 132.3636474609375, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.644, "grad_norm": 6.286128044128418, "kl": 0.150299072265625, "learning_rate": 1e-06, "loss": 0.1961, "num_tokens": 10533090.0, "reward": 0.03828435763716698, "reward_std": 0.01768323965370655, "rewards/bleu_reward_func/mean": 0.03828435763716698, "rewards/bleu_reward_func/std": 0.035699598491191864, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 286.03125, "completions/mean_terminated_length": 253.75001525878906, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.6448, "grad_norm": 3.333425283432007, "kl": 0.0326385498046875, "learning_rate": 1e-06, "loss": 0.0689, "num_tokens": 10544131.0, "reward": 0.11853313446044922, "reward_std": 0.06690388172864914, "rewards/bleu_reward_func/mean": 0.11853313446044922, "rewards/bleu_reward_func/std": 0.14521227777004242, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 176.6875, "completions/mean_terminated_length": 165.87095642089844, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6456, "grad_norm": 6.8076090812683105, "kl": 0.24957275390625, "learning_rate": 1e-06, "loss": -0.1496, "num_tokens": 10554641.0, "reward": 0.12172487378120422, "reward_std": 0.05724428966641426, "rewards/bleu_reward_func/mean": 0.12172487378120422, "rewards/bleu_reward_func/std": 0.11496427655220032, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 62.615386962890625, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.6464, "grad_norm": 9.55725383758545, "kl": 0.30224609375, "learning_rate": 1e-06, "loss": 0.0709, "num_tokens": 10563829.0, "reward": 0.20068883895874023, "reward_std": 0.06663694977760315, "rewards/bleu_reward_func/mean": 0.20068883895874023, "rewards/bleu_reward_func/std": 0.13896267116069794, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 141.34375, "completions/mean_terminated_length": 37.55999755859375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6472, "grad_norm": 26.54884147644043, "kl": 0.41717529296875, "learning_rate": 1e-06, "loss": -0.1795, "num_tokens": 10574624.0, "reward": 0.22808396816253662, "reward_std": 0.06877206265926361, "rewards/bleu_reward_func/mean": 0.22808396816253662, "rewards/bleu_reward_func/std": 0.21049334108829498, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 111.21875, "completions/mean_terminated_length": 111.21875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.648, "grad_norm": 13.591133117675781, "kl": 0.272674560546875, "learning_rate": 1e-06, "loss": -0.0292, "num_tokens": 10583671.0, "reward": 0.2966269850730896, "reward_std": 0.015265233814716339, "rewards/bleu_reward_func/mean": 0.2966269850730896, "rewards/bleu_reward_func/std": 0.24745707213878632, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 94.34375, "completions/mean_terminated_length": 80.87096405029297, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6488, "grad_norm": 10.449114799499512, "kl": 0.2784423828125, "learning_rate": 1e-06, "loss": -0.2645, "num_tokens": 10592586.0, "reward": 0.23048871755599976, "reward_std": 0.05683053284883499, "rewards/bleu_reward_func/mean": 0.23048871755599976, "rewards/bleu_reward_func/std": 0.304109662771225, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 184.65625, "completions/mean_terminated_length": 75.54167175292969, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.6496, "grad_norm": 6.2868170738220215, "kl": 0.202392578125, "learning_rate": 1e-06, "loss": 0.0856, "num_tokens": 10604447.0, "reward": 0.06996900588274002, "reward_std": 0.01753135770559311, "rewards/bleu_reward_func/mean": 0.06996900588274002, "rewards/bleu_reward_func/std": 0.07089151442050934, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 46.96875, "completions/mean_terminated_length": 46.96875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.6504, "grad_norm": 15.99052619934082, "kl": 0.55419921875, "learning_rate": 1e-06, "loss": 0.1537, "num_tokens": 10612710.0, "reward": 0.08658448606729507, "reward_std": 0.03601383790373802, "rewards/bleu_reward_func/mean": 0.08658448606729507, "rewards/bleu_reward_func/std": 0.05530841648578644, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 355.0, "completions/mean_terminated_length": 283.6363830566406, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.6512, "grad_norm": 3.1796348094940186, "kl": 0.059417724609375, "learning_rate": 1e-06, "loss": 0.1311, "num_tokens": 10625326.0, "reward": 0.11449694633483887, "reward_std": 0.027395280078053474, "rewards/bleu_reward_func/mean": 0.11449694633483887, "rewards/bleu_reward_func/std": 0.05288613215088844, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 128.96875, "completions/mean_terminated_length": 103.43334197998047, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.652, "grad_norm": 6.432667255401611, "kl": 0.1822509765625, "learning_rate": 1e-06, "loss": 0.1721, "num_tokens": 10635421.0, "reward": 0.18185263872146606, "reward_std": 0.0783199891448021, "rewards/bleu_reward_func/mean": 0.18185263872146606, "rewards/bleu_reward_func/std": 0.18959355354309082, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 242.9375, "completions/mean_terminated_length": 204.50001525878906, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.6528, "grad_norm": 4.343508720397949, "kl": 0.06640625, "learning_rate": 1e-06, "loss": 0.1592, "num_tokens": 10647283.0, "reward": 0.10118309408426285, "reward_std": 0.026538610458374023, "rewards/bleu_reward_func/mean": 0.10118309408426285, "rewards/bleu_reward_func/std": 0.08866976201534271, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 217.5625, "completions/mean_terminated_length": 175.50001525878906, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6536, "grad_norm": 4.102277755737305, "kl": 0.141204833984375, "learning_rate": 1e-06, "loss": 0.2738, "num_tokens": 10660189.0, "reward": 0.23801954090595245, "reward_std": 0.07484984397888184, "rewards/bleu_reward_func/mean": 0.23801954090595245, "rewards/bleu_reward_func/std": 0.168580561876297, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 180.625, "completions/mean_terminated_length": 169.93548583984375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.6544, "grad_norm": 5.659482002258301, "kl": 0.08270263671875, "learning_rate": 1e-06, "loss": 0.1325, "num_tokens": 10668393.0, "reward": 0.08136270940303802, "reward_std": 0.02622528187930584, "rewards/bleu_reward_func/mean": 0.08136270940303802, "rewards/bleu_reward_func/std": 0.03544744476675987, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 289.375, "completions/mean_terminated_length": 227.0399932861328, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.6552, "grad_norm": 3.059807300567627, "kl": 0.076446533203125, "learning_rate": 1e-06, "loss": -0.016, "num_tokens": 10682749.0, "reward": 0.07544586062431335, "reward_std": 0.0220788661390543, "rewards/bleu_reward_func/mean": 0.07544586062431335, "rewards/bleu_reward_func/std": 0.04309820756316185, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 271.4375, "completions/mean_terminated_length": 191.25, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.656, "grad_norm": 4.126210689544678, "kl": 0.06744384765625, "learning_rate": 1e-06, "loss": 0.2091, "num_tokens": 10693011.0, "reward": 0.12100762873888016, "reward_std": 0.040202461183071136, "rewards/bleu_reward_func/mean": 0.12100762873888016, "rewards/bleu_reward_func/std": 0.09315716475248337, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 360.34375, "completions/mean_terminated_length": 226.5294189453125, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.6568, "grad_norm": 3.169628620147705, "kl": 0.030731201171875, "learning_rate": 1e-06, "loss": -0.1334, "num_tokens": 10706846.0, "reward": 0.041019197553396225, "reward_std": 0.012767975218594074, "rewards/bleu_reward_func/mean": 0.041019197553396225, "rewards/bleu_reward_func/std": 0.050586286932229996, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 264.46875, "completions/mean_terminated_length": 46.05882263183594, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.6576, "grad_norm": 6.150150775909424, "kl": 0.174896240234375, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 10720189.0, "reward": 0.10611159354448318, "reward_std": 0.044405680149793625, "rewards/bleu_reward_func/mean": 0.10611159354448318, "rewards/bleu_reward_func/std": 0.10892455279827118, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 266.84375, "completions/mean_terminated_length": 210.2692413330078, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6584, "grad_norm": 6.753479957580566, "kl": 0.20147705078125, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 10731600.0, "reward": 0.10170187056064606, "reward_std": 0.03044716641306877, "rewards/bleu_reward_func/mean": 0.10170187056064606, "rewards/bleu_reward_func/std": 0.05836126208305359, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 392.28125, "completions/mean_terminated_length": 299.1666564941406, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.6592, "grad_norm": 2.5884182453155518, "kl": 0.0380859375, "learning_rate": 1e-06, "loss": -0.0465, "num_tokens": 10748121.0, "reward": 0.03844967484474182, "reward_std": 0.012901275418698788, "rewards/bleu_reward_func/mean": 0.03844967484474182, "rewards/bleu_reward_func/std": 0.032823171466588974, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 214.7692413330078, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.66, "grad_norm": 4.168792247772217, "kl": 0.083251953125, "learning_rate": 1e-06, "loss": -0.0346, "num_tokens": 10763353.0, "reward": 0.08619528263807297, "reward_std": 0.02499576285481453, "rewards/bleu_reward_func/mean": 0.08619528263807297, "rewards/bleu_reward_func/std": 0.10102304071187973, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 264.78125, "completions/mean_terminated_length": 195.55999755859375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.6608, "grad_norm": 10.431943893432617, "kl": 0.154876708984375, "learning_rate": 1e-06, "loss": 0.1006, "num_tokens": 10776642.0, "reward": 0.06286956369876862, "reward_std": 0.027797410264611244, "rewards/bleu_reward_func/mean": 0.06286956369876862, "rewards/bleu_reward_func/std": 0.07537111639976501, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 231.1875, "completions/mean_terminated_length": 202.13792419433594, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6616, "grad_norm": 7.486932754516602, "kl": 0.15045166015625, "learning_rate": 1e-06, "loss": 0.2166, "num_tokens": 10788656.0, "reward": 0.10479411482810974, "reward_std": 0.029287472367286682, "rewards/bleu_reward_func/mean": 0.10479411482810974, "rewards/bleu_reward_func/std": 0.07098822295665741, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 49.75, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6624, "grad_norm": 7.1387104988098145, "kl": 0.23382568359375, "learning_rate": 1e-06, "loss": 0.0727, "num_tokens": 10795930.0, "reward": 0.11342652887105942, "reward_std": 0.027198534458875656, "rewards/bleu_reward_func/mean": 0.11342652887105942, "rewards/bleu_reward_func/std": 0.11971734464168549, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 108.96875, "completions/mean_terminated_length": 108.96875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6632, "grad_norm": 8.27657413482666, "kl": 0.36053466796875, "learning_rate": 1e-06, "loss": -0.0108, "num_tokens": 10804241.0, "reward": 0.28853511810302734, "reward_std": 0.07218953967094421, "rewards/bleu_reward_func/mean": 0.28853511810302734, "rewards/bleu_reward_func/std": 0.20515379309654236, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 51.875, "completions/mean_terminated_length": 51.875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.664, "grad_norm": 8.845324516296387, "kl": 0.4793701171875, "learning_rate": 1e-06, "loss": 0.1651, "num_tokens": 10813957.0, "reward": 0.2881876826286316, "reward_std": 0.06279260665178299, "rewards/bleu_reward_func/mean": 0.2881876826286316, "rewards/bleu_reward_func/std": 0.23817574977874756, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 268.46875, "completions/mean_terminated_length": 157.77273559570312, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.6648, "grad_norm": 5.753482818603516, "kl": 0.143280029296875, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 10826356.0, "reward": 0.13365639746189117, "reward_std": 0.023316586390137672, "rewards/bleu_reward_func/mean": 0.13365639746189117, "rewards/bleu_reward_func/std": 0.20613247156143188, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 172.933349609375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6656, "grad_norm": 5.661513805389404, "kl": 0.106048583984375, "learning_rate": 1e-06, "loss": 0.0722, "num_tokens": 10836128.0, "reward": 0.08124659210443497, "reward_std": 0.016117524355649948, "rewards/bleu_reward_func/mean": 0.08124659210443497, "rewards/bleu_reward_func/std": 0.08725257217884064, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 26.625, "completions/mean_terminated_length": 26.625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6664, "grad_norm": 12.99825382232666, "kl": 0.453857421875, "learning_rate": 1e-06, "loss": 0.1057, "num_tokens": 10842116.0, "reward": 0.3153986930847168, "reward_std": 0.0658825933933258, "rewards/bleu_reward_func/mean": 0.3153986930847168, "rewards/bleu_reward_func/std": 0.17146961390972137, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 185.5625, "completions/mean_terminated_length": 94.15999603271484, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.6672, "grad_norm": 9.406866073608398, "kl": 0.144378662109375, "learning_rate": 1e-06, "loss": 0.2779, "num_tokens": 10852030.0, "reward": 0.13975293934345245, "reward_std": 0.04399016499519348, "rewards/bleu_reward_func/mean": 0.13975293934345245, "rewards/bleu_reward_func/std": 0.17490676045417786, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 198.25, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.668, "grad_norm": 8.956555366516113, "kl": 0.220977783203125, "learning_rate": 1e-06, "loss": -0.047, "num_tokens": 10869930.0, "reward": 0.04670516401529312, "reward_std": 0.01496485248208046, "rewards/bleu_reward_func/mean": 0.04670516401529312, "rewards/bleu_reward_func/std": 0.03720833733677864, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 370.71875, "completions/mean_terminated_length": 331.1600036621094, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.6688, "grad_norm": 2.452960729598999, "kl": 0.048583984375, "learning_rate": 1e-06, "loss": -0.0511, "num_tokens": 10884969.0, "reward": 0.05606111139059067, "reward_std": 0.01513909362256527, "rewards/bleu_reward_func/mean": 0.05606111139059067, "rewards/bleu_reward_func/std": 0.05016703903675079, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 200.40625, "completions/mean_terminated_length": 96.54167175292969, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.6696, "grad_norm": 4.966017723083496, "kl": 0.11663818359375, "learning_rate": 1e-06, "loss": 0.2335, "num_tokens": 10897982.0, "reward": 0.0694584771990776, "reward_std": 0.04167729243636131, "rewards/bleu_reward_func/mean": 0.0694584771990776, "rewards/bleu_reward_func/std": 0.06985452026128769, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 126.00000762939453, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.6704, "grad_norm": 6.741494178771973, "kl": 0.11309814453125, "learning_rate": 1e-06, "loss": -0.1029, "num_tokens": 10911978.0, "reward": 0.2410159707069397, "reward_std": 0.056731171905994415, "rewards/bleu_reward_func/mean": 0.2410159707069397, "rewards/bleu_reward_func/std": 0.20536428689956665, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 198.8125, "completions/mean_terminated_length": 140.8148193359375, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.6712, "grad_norm": 5.741243839263916, "kl": 0.2135009765625, "learning_rate": 1e-06, "loss": 0.1006, "num_tokens": 10925892.0, "reward": 0.2018284797668457, "reward_std": 0.04848968982696533, "rewards/bleu_reward_func/mean": 0.2018284797668457, "rewards/bleu_reward_func/std": 0.19715876877307892, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.672, "grad_norm": 6.41207218170166, "kl": 0.09161376953125, "learning_rate": 1e-06, "loss": -0.0922, "num_tokens": 10934300.0, "reward": 0.0522538498044014, "reward_std": 0.021779239177703857, "rewards/bleu_reward_func/mean": 0.0522538498044014, "rewards/bleu_reward_func/std": 0.02408943697810173, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 389.8125, "completions/mean_terminated_length": 316.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.6728, "grad_norm": 2.996119976043701, "kl": 0.06011962890625, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 10951166.0, "reward": 0.08128196746110916, "reward_std": 0.01865270733833313, "rewards/bleu_reward_func/mean": 0.08128196746110916, "rewards/bleu_reward_func/std": 0.05130209028720856, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 180.13792419433594, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.6736, "grad_norm": 4.3674750328063965, "kl": 0.07989501953125, "learning_rate": 1e-06, "loss": 0.2159, "num_tokens": 10962038.0, "reward": 0.035381607711315155, "reward_std": 0.015435540117323399, "rewards/bleu_reward_func/mean": 0.035381607711315155, "rewards/bleu_reward_func/std": 0.02227640338242054, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 326.15625, "completions/mean_terminated_length": 241.68182373046875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.6744, "grad_norm": 7.144293308258057, "kl": 0.1136932373046875, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 10980947.0, "reward": 0.24271616339683533, "reward_std": 0.03907809406518936, "rewards/bleu_reward_func/mean": 0.24271616339683533, "rewards/bleu_reward_func/std": 0.21944448351860046, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 318.6875, "completions/mean_terminated_length": 274.0769348144531, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.6752, "grad_norm": 3.7767539024353027, "kl": 0.080078125, "learning_rate": 1e-06, "loss": -0.0971, "num_tokens": 10993977.0, "reward": 0.034242644906044006, "reward_std": 0.01977381855249405, "rewards/bleu_reward_func/mean": 0.034242644906044006, "rewards/bleu_reward_func/std": 0.024919696152210236, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 73.55555725097656, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.676, "grad_norm": 8.71445083618164, "kl": 0.34588623046875, "learning_rate": 1e-06, "loss": 0.0431, "num_tokens": 11007565.0, "reward": 0.23305484652519226, "reward_std": 0.05401034653186798, "rewards/bleu_reward_func/mean": 0.23305484652519226, "rewards/bleu_reward_func/std": 0.2091369926929474, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 305.2174072265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.6768, "grad_norm": 5.406923770904541, "kl": 0.061279296875, "learning_rate": 1e-06, "loss": -0.1012, "num_tokens": 11023473.0, "reward": 0.15702804923057556, "reward_std": 0.02012755163013935, "rewards/bleu_reward_func/mean": 0.15702804923057556, "rewards/bleu_reward_func/std": 0.16683605313301086, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 84.125, "completions/mean_terminated_length": 70.32257843017578, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6776, "grad_norm": 7.79757022857666, "kl": 0.094970703125, "learning_rate": 1e-06, "loss": 0.1681, "num_tokens": 11033277.0, "reward": 0.10883745551109314, "reward_std": 0.06399966031312943, "rewards/bleu_reward_func/mean": 0.10883745551109314, "rewards/bleu_reward_func/std": 0.11935968697071075, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 239.8518524169922, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6784, "grad_norm": 7.137828350067139, "kl": 0.18231201171875, "learning_rate": 1e-06, "loss": -0.1134, "num_tokens": 11047977.0, "reward": 0.07731978595256805, "reward_std": 0.026035165414214134, "rewards/bleu_reward_func/mean": 0.07731978595256805, "rewards/bleu_reward_func/std": 0.08138881623744965, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 180.21875, "completions/mean_terminated_length": 118.77777862548828, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.6792, "grad_norm": 10.036446571350098, "kl": 0.2659912109375, "learning_rate": 1e-06, "loss": 0.1133, "num_tokens": 11058320.0, "reward": 0.13975116610527039, "reward_std": 0.02090391516685486, "rewards/bleu_reward_func/mean": 0.13975116610527039, "rewards/bleu_reward_func/std": 0.15142837166786194, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 369.6875, "completions/mean_terminated_length": 284.3000183105469, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.68, "grad_norm": 3.8194141387939453, "kl": 0.038970947265625, "learning_rate": 1e-06, "loss": 0.0524, "num_tokens": 11073662.0, "reward": 0.08185985684394836, "reward_std": 0.033635906875133514, "rewards/bleu_reward_func/mean": 0.08185985684394836, "rewards/bleu_reward_func/std": 0.06655923277139664, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 106.5, "completions/mean_terminated_length": 31.407407760620117, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.6808, "grad_norm": 6.564956188201904, "kl": 0.1971435546875, "learning_rate": 1e-06, "loss": 0.1071, "num_tokens": 11085022.0, "reward": 0.22551283240318298, "reward_std": 0.04716075211763382, "rewards/bleu_reward_func/mean": 0.22551283240318298, "rewards/bleu_reward_func/std": 0.16660061478614807, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 130.53334045410156, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.6816, "grad_norm": 15.921568870544434, "kl": 0.28778076171875, "learning_rate": 1e-06, "loss": -0.0597, "num_tokens": 11095378.0, "reward": 0.1252121478319168, "reward_std": 0.05527370423078537, "rewards/bleu_reward_func/mean": 0.1252121478319168, "rewards/bleu_reward_func/std": 0.12923383712768555, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 305.9375, "completions/mean_terminated_length": 212.27273559570312, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.6824, "grad_norm": 8.1792631149292, "kl": 0.1992645263671875, "learning_rate": 1e-06, "loss": 0.0667, "num_tokens": 11109616.0, "reward": 0.0659586489200592, "reward_std": 0.027510065585374832, "rewards/bleu_reward_func/mean": 0.0659586489200592, "rewards/bleu_reward_func/std": 0.08249466121196747, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 224.2105255126953, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.6832, "grad_norm": 2.265425443649292, "kl": 0.039276123046875, "learning_rate": 1e-06, "loss": 0.2352, "num_tokens": 11123036.0, "reward": 0.10829215496778488, "reward_std": 0.10021056979894638, "rewards/bleu_reward_func/mean": 0.10829215496778488, "rewards/bleu_reward_func/std": 0.15350966155529022, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 398.5625, "completions/mean_terminated_length": 285.125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.684, "grad_norm": 2.628075361251831, "kl": 0.047515869140625, "learning_rate": 1e-06, "loss": 0.1379, "num_tokens": 11139950.0, "reward": 0.04525969177484512, "reward_std": 0.025323685258626938, "rewards/bleu_reward_func/mean": 0.04525969177484512, "rewards/bleu_reward_func/std": 0.04984954744577408, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 489.0625, "completions/mean_terminated_length": 389.66668701171875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.6848, "grad_norm": 2.320620059967041, "kl": 0.035888671875, "learning_rate": 1e-06, "loss": -0.0345, "num_tokens": 11158672.0, "reward": 0.016138827428221703, "reward_std": 0.0038068746216595173, "rewards/bleu_reward_func/mean": 0.016138827428221703, "rewards/bleu_reward_func/std": 0.016928784549236298, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 317.84375, "completions/mean_terminated_length": 241.86956787109375, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.6856, "grad_norm": 3.551910638809204, "kl": 0.109039306640625, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 11173291.0, "reward": 0.20694701373577118, "reward_std": 0.014496378600597382, "rewards/bleu_reward_func/mean": 0.20694701373577118, "rewards/bleu_reward_func/std": 0.2963625490665436, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 62.92308044433594, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.6864, "grad_norm": 15.02341079711914, "kl": 0.70330810546875, "learning_rate": 1e-06, "loss": 0.2886, "num_tokens": 11182215.0, "reward": 0.19951725006103516, "reward_std": 0.052443791180849075, "rewards/bleu_reward_func/mean": 0.19951725006103516, "rewards/bleu_reward_func/std": 0.19433696568012238, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 128.2105255126953, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6872, "grad_norm": 8.070085525512695, "kl": 0.110443115234375, "learning_rate": 1e-06, "loss": 0.1827, "num_tokens": 11194667.0, "reward": 0.04423338174819946, "reward_std": 0.017294086515903473, "rewards/bleu_reward_func/mean": 0.04423338174819946, "rewards/bleu_reward_func/std": 0.047055598348379135, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 329.71875, "completions/mean_terminated_length": 287.65386962890625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.688, "grad_norm": 2.8106324672698975, "kl": 0.039154052734375, "learning_rate": 1e-06, "loss": -0.2058, "num_tokens": 11207642.0, "reward": 0.06786108016967773, "reward_std": 0.0352618470788002, "rewards/bleu_reward_func/mean": 0.06786108016967773, "rewards/bleu_reward_func/std": 0.04090343415737152, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 368.53125, "completions/mean_terminated_length": 320.7083435058594, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.6888, "grad_norm": 2.837979793548584, "kl": 0.052001953125, "learning_rate": 1e-06, "loss": 0.107, "num_tokens": 11221547.0, "reward": 0.07621696591377258, "reward_std": 0.029543904587626457, "rewards/bleu_reward_func/mean": 0.07621696591377258, "rewards/bleu_reward_func/std": 0.04072652757167816, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 398.59375, "completions/mean_terminated_length": 321.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6896, "grad_norm": 2.941206693649292, "kl": 0.046844482421875, "learning_rate": 1e-06, "loss": -0.0139, "num_tokens": 11237182.0, "reward": 0.026391834020614624, "reward_std": 0.016949903219938278, "rewards/bleu_reward_func/mean": 0.026391834020614624, "rewards/bleu_reward_func/std": 0.03409172222018242, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 357.40625, "completions/mean_terminated_length": 251.63157653808594, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.6904, "grad_norm": 2.633358955383301, "kl": 0.055328369140625, "learning_rate": 1e-06, "loss": -0.0911, "num_tokens": 11251547.0, "reward": 0.042053550481796265, "reward_std": 0.021867552772164345, "rewards/bleu_reward_func/mean": 0.042053550481796265, "rewards/bleu_reward_func/std": 0.029616717249155045, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 168.3125, "completions/mean_terminated_length": 119.21429443359375, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.6912, "grad_norm": 4.309210777282715, "kl": 0.13665771484375, "learning_rate": 1e-06, "loss": -0.1275, "num_tokens": 11261245.0, "reward": 0.1768188774585724, "reward_std": 0.030298635363578796, "rewards/bleu_reward_func/mean": 0.1768188774585724, "rewards/bleu_reward_func/std": 0.12399855256080627, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 252.78125, "completions/mean_terminated_length": 151.3478240966797, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.692, "grad_norm": 8.154788970947266, "kl": 0.23638916015625, "learning_rate": 1e-06, "loss": 0.0438, "num_tokens": 11275478.0, "reward": 0.07366465032100677, "reward_std": 0.029438909143209457, "rewards/bleu_reward_func/mean": 0.07366465032100677, "rewards/bleu_reward_func/std": 0.05699191242456436, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 322.21875, "completions/mean_terminated_length": 235.95455932617188, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6928, "grad_norm": 2.933178663253784, "kl": 0.04754638671875, "learning_rate": 1e-06, "loss": 0.0639, "num_tokens": 11287317.0, "reward": 0.041873324662446976, "reward_std": 0.02685678005218506, "rewards/bleu_reward_func/mean": 0.041873324662446976, "rewards/bleu_reward_func/std": 0.039241958409547806, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 208.1875, "completions/mean_terminated_length": 89.30435180664062, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.6936, "grad_norm": 6.312671661376953, "kl": 0.1842041015625, "learning_rate": 1e-06, "loss": 0.1617, "num_tokens": 11297835.0, "reward": 0.103369802236557, "reward_std": 0.04473632201552391, "rewards/bleu_reward_func/mean": 0.103369802236557, "rewards/bleu_reward_func/std": 0.10830661654472351, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 230.37037658691406, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.6944, "grad_norm": 3.08828067779541, "kl": 0.090667724609375, "learning_rate": 1e-06, "loss": -0.0115, "num_tokens": 11312311.0, "reward": 0.16378189623355865, "reward_std": 0.0222244244068861, "rewards/bleu_reward_func/mean": 0.16378189623355865, "rewards/bleu_reward_func/std": 0.19553562998771667, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 216.15625, "completions/mean_terminated_length": 117.54167175292969, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.6952, "grad_norm": 5.62147331237793, "kl": 0.15509033203125, "learning_rate": 1e-06, "loss": -0.0272, "num_tokens": 11325348.0, "reward": 0.059518001973629, "reward_std": 0.028110869228839874, "rewards/bleu_reward_func/mean": 0.059518001973629, "rewards/bleu_reward_func/std": 0.048489734530448914, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 404.46875, "completions/mean_terminated_length": 296.9375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.696, "grad_norm": 3.3346071243286133, "kl": 0.038360595703125, "learning_rate": 1e-06, "loss": -0.0205, "num_tokens": 11343635.0, "reward": 0.07933641970157623, "reward_std": 0.021958988159894943, "rewards/bleu_reward_func/mean": 0.07933641970157623, "rewards/bleu_reward_func/std": 0.06096653267741203, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 236.6875, "completions/mean_terminated_length": 185.70370483398438, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.6968, "grad_norm": 4.446465015411377, "kl": 0.17974853515625, "learning_rate": 1e-06, "loss": -0.0515, "num_tokens": 11353817.0, "reward": 0.11160654574632645, "reward_std": 0.039265286177396774, "rewards/bleu_reward_func/mean": 0.11160654574632645, "rewards/bleu_reward_func/std": 0.08857923746109009, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 161.56521606445312, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.6976, "grad_norm": 7.414572715759277, "kl": 0.107696533203125, "learning_rate": 1e-06, "loss": -0.0123, "num_tokens": 11367725.0, "reward": 0.1780683994293213, "reward_std": 0.015433109365403652, "rewards/bleu_reward_func/mean": 0.1780683994293213, "rewards/bleu_reward_func/std": 0.2229662984609604, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 408.0625, "completions/mean_terminated_length": 360.8182067871094, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.6984, "grad_norm": 3.227613925933838, "kl": 0.0611572265625, "learning_rate": 1e-06, "loss": -0.0681, "num_tokens": 11385423.0, "reward": 0.0366949737071991, "reward_std": 0.01884927786886692, "rewards/bleu_reward_func/mean": 0.0366949737071991, "rewards/bleu_reward_func/std": 0.028229771181941032, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 222.21875, "completions/mean_terminated_length": 168.55555725097656, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.6992, "grad_norm": 7.29306697845459, "kl": 0.142730712890625, "learning_rate": 1e-06, "loss": 0.11, "num_tokens": 11397022.0, "reward": 0.046667180955410004, "reward_std": 0.020207617431879044, "rewards/bleu_reward_func/mean": 0.046667180955410004, "rewards/bleu_reward_func/std": 0.02555895410478115, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 173.21739196777344, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.7, "grad_norm": 3.604617118835449, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": -0.3505, "num_tokens": 11410046.0, "reward": 0.07897455990314484, "reward_std": 0.014880911447107792, "rewards/bleu_reward_func/mean": 0.07897455990314484, "rewards/bleu_reward_func/std": 0.08343996107578278, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 325.46875, "completions/mean_terminated_length": 227.76190185546875, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.7008, "grad_norm": 8.727375030517578, "kl": 0.133270263671875, "learning_rate": 1e-06, "loss": 0.3117, "num_tokens": 11422725.0, "reward": 0.07061035186052322, "reward_std": 0.0419192910194397, "rewards/bleu_reward_func/mean": 0.07061035186052322, "rewards/bleu_reward_func/std": 0.07667659968137741, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 181.375, "completions/mean_terminated_length": 120.14814758300781, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7016, "grad_norm": 7.317707061767578, "kl": 0.21905517578125, "learning_rate": 1e-06, "loss": 0.4234, "num_tokens": 11431937.0, "reward": 0.10765747725963593, "reward_std": 0.052248626947402954, "rewards/bleu_reward_func/mean": 0.10765747725963593, "rewards/bleu_reward_func/std": 0.05436404421925545, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 211.35482788085938, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7024, "grad_norm": 4.363486289978027, "kl": 0.214019775390625, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 11443057.0, "reward": 0.30547034740448, "reward_std": 0.024015674367547035, "rewards/bleu_reward_func/mean": 0.30547034740448, "rewards/bleu_reward_func/std": 0.2281493991613388, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 205.46875, "completions/mean_terminated_length": 119.63999938964844, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.7032, "grad_norm": 8.484012603759766, "kl": 0.189361572265625, "learning_rate": 1e-06, "loss": 0.1579, "num_tokens": 11451232.0, "reward": 0.0824245885014534, "reward_std": 0.04487679526209831, "rewards/bleu_reward_func/mean": 0.0824245885014534, "rewards/bleu_reward_func/std": 0.07150331139564514, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 129.4193572998047, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.704, "grad_norm": 5.958530902862549, "kl": 0.1085205078125, "learning_rate": 1e-06, "loss": -0.0313, "num_tokens": 11462604.0, "reward": 0.04005417972803116, "reward_std": 0.024934137240052223, "rewards/bleu_reward_func/mean": 0.04005417972803116, "rewards/bleu_reward_func/std": 0.03826345130801201, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 283.03125, "completions/mean_terminated_length": 126.36842346191406, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7048, "grad_norm": 6.075742244720459, "kl": 0.3231201171875, "learning_rate": 1e-06, "loss": 0.0444, "num_tokens": 11477461.0, "reward": 0.10366402566432953, "reward_std": 0.055370062589645386, "rewards/bleu_reward_func/mean": 0.10366402566432953, "rewards/bleu_reward_func/std": 0.11003145575523376, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 361.0, "completions/mean_terminated_length": 227.76470947265625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7056, "grad_norm": 5.96100378036499, "kl": 0.19482421875, "learning_rate": 1e-06, "loss": -0.0353, "num_tokens": 11493365.0, "reward": 0.1235186904668808, "reward_std": 0.038026995956897736, "rewards/bleu_reward_func/mean": 0.1235186904668808, "rewards/bleu_reward_func/std": 0.05816841870546341, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 172.09375, "completions/mean_terminated_length": 109.14814758300781, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.7064, "grad_norm": 6.117469787597656, "kl": 0.201904296875, "learning_rate": 1e-06, "loss": 0.067, "num_tokens": 11501456.0, "reward": 0.15373189747333527, "reward_std": 0.05197744071483612, "rewards/bleu_reward_func/mean": 0.15373189747333527, "rewards/bleu_reward_func/std": 0.10633216798305511, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 263.15625, "completions/mean_terminated_length": 193.47999572753906, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7072, "grad_norm": 4.437005519866943, "kl": 0.12554931640625, "learning_rate": 1e-06, "loss": 0.2681, "num_tokens": 11517437.0, "reward": 0.29476526379585266, "reward_std": 0.13803553581237793, "rewards/bleu_reward_func/mean": 0.29476526379585266, "rewards/bleu_reward_func/std": 0.32065168023109436, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 252.75, "completions/mean_terminated_length": 134.90908813476562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.708, "grad_norm": 8.795902252197266, "kl": 0.3934326171875, "learning_rate": 1e-06, "loss": -0.0341, "num_tokens": 11529885.0, "reward": 0.13262969255447388, "reward_std": 0.037800293415784836, "rewards/bleu_reward_func/mean": 0.13262969255447388, "rewards/bleu_reward_func/std": 0.11564164608716965, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 355.6875, "completions/mean_terminated_length": 199.375, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.7088, "grad_norm": 5.155429840087891, "kl": 0.0558319091796875, "learning_rate": 1e-06, "loss": 0.0544, "num_tokens": 11546155.0, "reward": 0.10861489176750183, "reward_std": 0.035860445350408554, "rewards/bleu_reward_func/mean": 0.10861489176750183, "rewards/bleu_reward_func/std": 0.08613201975822449, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 282.84375, "completions/mean_terminated_length": 126.0526351928711, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.7096, "grad_norm": 4.782761096954346, "kl": 0.133056640625, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 11561390.0, "reward": 0.0671025738120079, "reward_std": 0.018492672592401505, "rewards/bleu_reward_func/mean": 0.0671025738120079, "rewards/bleu_reward_func/std": 0.06450604647397995, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 331.6875, "completions/mean_terminated_length": 261.13043212890625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.7104, "grad_norm": 2.8767964839935303, "kl": 0.06182861328125, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 11577356.0, "reward": 0.1093081682920456, "reward_std": 0.07805053889751434, "rewards/bleu_reward_func/mean": 0.1093081682920456, "rewards/bleu_reward_func/std": 0.17048169672489166, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 397.375, "completions/mean_terminated_length": 359.16668701171875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.7112, "grad_norm": 2.2424142360687256, "kl": 0.0458984375, "learning_rate": 1e-06, "loss": -0.015, "num_tokens": 11591432.0, "reward": 0.06777183711528778, "reward_std": 0.019787484779953957, "rewards/bleu_reward_func/mean": 0.06777183711528778, "rewards/bleu_reward_func/std": 0.041765324771404266, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 292.59375, "completions/mean_terminated_length": 160.9499969482422, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.712, "grad_norm": 4.390679359436035, "kl": 0.0987548828125, "learning_rate": 1e-06, "loss": 0.2246, "num_tokens": 11603515.0, "reward": 0.06538625806570053, "reward_std": 0.03718053176999092, "rewards/bleu_reward_func/mean": 0.06538625806570053, "rewards/bleu_reward_func/std": 0.0816822499036789, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 155.44827270507812, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.7128, "grad_norm": 5.5804290771484375, "kl": 0.123260498046875, "learning_rate": 1e-06, "loss": -0.2822, "num_tokens": 11612927.0, "reward": 0.0781329870223999, "reward_std": 0.049637503921985626, "rewards/bleu_reward_func/mean": 0.0781329870223999, "rewards/bleu_reward_func/std": 0.08602513372898102, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 332.96875, "completions/mean_terminated_length": 282.8399963378906, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.7136, "grad_norm": 2.8855247497558594, "kl": 0.0643310546875, "learning_rate": 1e-06, "loss": 0.1736, "num_tokens": 11625662.0, "reward": 0.03828759491443634, "reward_std": 0.024871867150068283, "rewards/bleu_reward_func/mean": 0.03828759491443634, "rewards/bleu_reward_func/std": 0.03181852772831917, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 322.96875, "completions/mean_terminated_length": 223.952392578125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.7144, "grad_norm": 8.224991798400879, "kl": 0.19036865234375, "learning_rate": 1e-06, "loss": 0.1601, "num_tokens": 11642029.0, "reward": 0.03835766017436981, "reward_std": 0.013130895793437958, "rewards/bleu_reward_func/mean": 0.03835766017436981, "rewards/bleu_reward_func/std": 0.024478256702423096, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 205.46875, "completions/mean_terminated_length": 148.70370483398438, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7152, "grad_norm": 6.305431842803955, "kl": 0.14776611328125, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 11657452.0, "reward": 0.11420266330242157, "reward_std": 0.04108916223049164, "rewards/bleu_reward_func/mean": 0.11420266330242157, "rewards/bleu_reward_func/std": 0.06337518244981766, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 336.71875, "completions/mean_terminated_length": 200.38888549804688, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.716, "grad_norm": 4.193768501281738, "kl": 0.13232421875, "learning_rate": 1e-06, "loss": 0.3397, "num_tokens": 11672971.0, "reward": 0.07947193086147308, "reward_std": 0.04811304062604904, "rewards/bleu_reward_func/mean": 0.07947193086147308, "rewards/bleu_reward_func/std": 0.10142233967781067, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 123.09375, "completions/mean_terminated_length": 97.16667175292969, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7168, "grad_norm": 9.166439056396484, "kl": 0.28271484375, "learning_rate": 1e-06, "loss": -0.1089, "num_tokens": 11684334.0, "reward": 0.27329233288764954, "reward_std": 0.059711530804634094, "rewards/bleu_reward_func/mean": 0.27329233288764954, "rewards/bleu_reward_func/std": 0.1879579871892929, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 157.33334350585938, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7176, "grad_norm": 4.767898082733154, "kl": 0.112060546875, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 11700486.0, "reward": 0.07844039797782898, "reward_std": 0.034808725118637085, "rewards/bleu_reward_func/mean": 0.07844039797782898, "rewards/bleu_reward_func/std": 0.0884510949254036, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 140.2105255126953, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7184, "grad_norm": 5.6404266357421875, "kl": 0.126312255859375, "learning_rate": 1e-06, "loss": 0.0453, "num_tokens": 11712438.0, "reward": 0.07097087800502777, "reward_std": 0.03667715564370155, "rewards/bleu_reward_func/mean": 0.07097087800502777, "rewards/bleu_reward_func/std": 0.08086320012807846, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 264.6875, "completions/mean_terminated_length": 229.35714721679688, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7192, "grad_norm": 4.8742499351501465, "kl": 0.10357666015625, "learning_rate": 1e-06, "loss": 0.1854, "num_tokens": 11725396.0, "reward": 0.21620362997055054, "reward_std": 0.07608456909656525, "rewards/bleu_reward_func/mean": 0.21620362997055054, "rewards/bleu_reward_func/std": 0.2514094114303589, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 329.46875, "completions/mean_terminated_length": 268.625, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.72, "grad_norm": 4.756425857543945, "kl": 0.10198974609375, "learning_rate": 1e-06, "loss": 0.0469, "num_tokens": 11738307.0, "reward": 0.053835704922676086, "reward_std": 0.012895071879029274, "rewards/bleu_reward_func/mean": 0.053835704922676086, "rewards/bleu_reward_func/std": 0.03540419042110443, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 401.65625, "completions/mean_terminated_length": 259.7857360839844, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.7208, "grad_norm": 3.1605112552642822, "kl": 0.04974365234375, "learning_rate": 1e-06, "loss": 0.2297, "num_tokens": 11756496.0, "reward": 0.24388237297534943, "reward_std": 0.1161736249923706, "rewards/bleu_reward_func/mean": 0.24388237297534943, "rewards/bleu_reward_func/std": 0.3413524627685547, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 271.59375, "completions/mean_terminated_length": 246.72413635253906, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7216, "grad_norm": 4.188157081604004, "kl": 0.1815185546875, "learning_rate": 1e-06, "loss": -0.0606, "num_tokens": 11768691.0, "reward": 0.12793870270252228, "reward_std": 0.04022746905684471, "rewards/bleu_reward_func/mean": 0.12793870270252228, "rewards/bleu_reward_func/std": 0.15937677025794983, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 282.65625, "completions/mean_terminated_length": 178.4091033935547, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.7224, "grad_norm": 4.868112087249756, "kl": 0.1282958984375, "learning_rate": 1e-06, "loss": -0.0395, "num_tokens": 11780800.0, "reward": 0.09620735794305801, "reward_std": 0.021982625126838684, "rewards/bleu_reward_func/mean": 0.09620735794305801, "rewards/bleu_reward_func/std": 0.07161340862512589, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 277.09375, "completions/mean_terminated_length": 222.88462829589844, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7232, "grad_norm": 7.519046783447266, "kl": 0.212158203125, "learning_rate": 1e-06, "loss": -0.1487, "num_tokens": 11795403.0, "reward": 0.07747071981430054, "reward_std": 0.03376290947198868, "rewards/bleu_reward_func/mean": 0.07747071981430054, "rewards/bleu_reward_func/std": 0.055931881070137024, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 253.4375, "completions/mean_terminated_length": 167.25, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.724, "grad_norm": 8.574625015258789, "kl": 0.1490478515625, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 11806985.0, "reward": 0.06105317175388336, "reward_std": 0.019554441794753075, "rewards/bleu_reward_func/mean": 0.06105317175388336, "rewards/bleu_reward_func/std": 0.038146011531353, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 72.1875, "completions/mean_terminated_length": 58.0, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7248, "grad_norm": 10.871319770812988, "kl": 0.32177734375, "learning_rate": 1e-06, "loss": 0.2813, "num_tokens": 11812335.0, "reward": 0.09286689758300781, "reward_std": 0.02634507045149803, "rewards/bleu_reward_func/mean": 0.09286689758300781, "rewards/bleu_reward_func/std": 0.04922043904662132, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 239.3125, "completions/mean_terminated_length": 200.35714721679688, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7256, "grad_norm": 23.129505157470703, "kl": 0.1500244140625, "learning_rate": 1e-06, "loss": 0.0482, "num_tokens": 11828945.0, "reward": 0.07308115810155869, "reward_std": 0.014882557094097137, "rewards/bleu_reward_func/mean": 0.07308115810155869, "rewards/bleu_reward_func/std": 0.08316269516944885, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 429.8125, "completions/mean_terminated_length": 292.8333435058594, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.7264, "grad_norm": 11.226503372192383, "kl": 0.059967041015625, "learning_rate": 1e-06, "loss": 0.0269, "num_tokens": 11848283.0, "reward": 0.04945487529039383, "reward_std": 0.016689039766788483, "rewards/bleu_reward_func/mean": 0.04945487529039383, "rewards/bleu_reward_func/std": 0.04881744086742401, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 225.09375, "completions/mean_terminated_length": 129.45834350585938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7272, "grad_norm": 8.629831314086914, "kl": 0.2176055908203125, "learning_rate": 1e-06, "loss": 0.5547, "num_tokens": 11858854.0, "reward": 0.14837728440761566, "reward_std": 0.06372867524623871, "rewards/bleu_reward_func/mean": 0.14837728440761566, "rewards/bleu_reward_func/std": 0.18777750432491302, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 74.25, "completions/mean_terminated_length": 74.25, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.728, "grad_norm": 13.281477928161621, "kl": 0.438720703125, "learning_rate": 1e-06, "loss": 0.0534, "num_tokens": 11869046.0, "reward": 0.20467601716518402, "reward_std": 0.04131526127457619, "rewards/bleu_reward_func/mean": 0.20467601716518402, "rewards/bleu_reward_func/std": 0.14035604894161224, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 341.625, "completions/mean_terminated_length": 148.53334045410156, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7288, "grad_norm": 6.93421745300293, "kl": 0.177642822265625, "learning_rate": 1e-06, "loss": 0.0368, "num_tokens": 11883586.0, "reward": 0.18407613039016724, "reward_std": 0.020998071879148483, "rewards/bleu_reward_func/mean": 0.18407613039016724, "rewards/bleu_reward_func/std": 0.2021336704492569, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 222.1875, "completions/mean_terminated_length": 168.51852416992188, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7296, "grad_norm": 4.430690765380859, "kl": 0.18560791015625, "learning_rate": 1e-06, "loss": 0.1092, "num_tokens": 11894448.0, "reward": 0.1439959555864334, "reward_std": 0.04086273908615112, "rewards/bleu_reward_func/mean": 0.1439959555864334, "rewards/bleu_reward_func/std": 0.1705217957496643, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 397.25, "completions/mean_terminated_length": 345.0909118652344, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7304, "grad_norm": 2.7031519412994385, "kl": 0.05963134765625, "learning_rate": 1e-06, "loss": 0.0584, "num_tokens": 11909408.0, "reward": 0.09079495072364807, "reward_std": 0.021243298426270485, "rewards/bleu_reward_func/mean": 0.09079495072364807, "rewards/bleu_reward_func/std": 0.1052529513835907, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 282.9375, "completions/mean_terminated_length": 126.21052551269531, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7312, "grad_norm": 6.302535057067871, "kl": 0.17462158203125, "learning_rate": 1e-06, "loss": -0.1643, "num_tokens": 11923182.0, "reward": 0.10389965772628784, "reward_std": 0.03838275372982025, "rewards/bleu_reward_func/mean": 0.10389965772628784, "rewards/bleu_reward_func/std": 0.10838860273361206, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 411.4375, "completions/mean_terminated_length": 282.14288330078125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.732, "grad_norm": 2.603992223739624, "kl": 0.052398681640625, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 11939596.0, "reward": 0.06734529137611389, "reward_std": 0.0207513514906168, "rewards/bleu_reward_func/mean": 0.06734529137611389, "rewards/bleu_reward_func/std": 0.05821956321597099, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 111.90625, "completions/mean_terminated_length": 111.90625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.7328, "grad_norm": 15.395035743713379, "kl": 0.2227783203125, "learning_rate": 1e-06, "loss": 0.3837, "num_tokens": 11947113.0, "reward": 0.24101027846336365, "reward_std": 0.07465855032205582, "rewards/bleu_reward_func/mean": 0.24101027846336365, "rewards/bleu_reward_func/std": 0.17581383883953094, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 312.90625, "completions/mean_terminated_length": 176.68421936035156, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.7336, "grad_norm": 8.670394897460938, "kl": 0.054595947265625, "learning_rate": 1e-06, "loss": -0.1736, "num_tokens": 11965846.0, "reward": 0.2349693477153778, "reward_std": 0.042255695909261703, "rewards/bleu_reward_func/mean": 0.2349693477153778, "rewards/bleu_reward_func/std": 0.37363162636756897, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 384.34375, "completions/mean_terminated_length": 317.4761962890625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.7344, "grad_norm": 2.5599606037139893, "kl": 0.06549072265625, "learning_rate": 1e-06, "loss": -0.0089, "num_tokens": 11980945.0, "reward": 0.061901748180389404, "reward_std": 0.02856561914086342, "rewards/bleu_reward_func/mean": 0.061901748180389404, "rewards/bleu_reward_func/std": 0.04196527600288391, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 203.9375, "completions/mean_terminated_length": 146.88888549804688, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7352, "grad_norm": 4.163308143615723, "kl": 0.08990478515625, "learning_rate": 1e-06, "loss": -0.0627, "num_tokens": 11994255.0, "reward": 0.06285493075847626, "reward_std": 0.027241935953497887, "rewards/bleu_reward_func/mean": 0.06285493075847626, "rewards/bleu_reward_func/std": 0.03245123475790024, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 445.90625, "completions/mean_terminated_length": 406.25, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.736, "grad_norm": 2.6394991874694824, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": -0.0646, "num_tokens": 12010980.0, "reward": 0.05676237493753433, "reward_std": 0.014085400849580765, "rewards/bleu_reward_func/mean": 0.05676237493753433, "rewards/bleu_reward_func/std": 0.03611414507031441, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 106.0625, "completions/mean_terminated_length": 92.96774291992188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.7368, "grad_norm": 8.88719654083252, "kl": 0.19818115234375, "learning_rate": 1e-06, "loss": -0.1221, "num_tokens": 12017206.0, "reward": 0.08727812767028809, "reward_std": 0.05162365734577179, "rewards/bleu_reward_func/mean": 0.08727812767028809, "rewards/bleu_reward_func/std": 0.07182831317186356, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 162.48275756835938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7376, "grad_norm": 7.483645439147949, "kl": 0.206787109375, "learning_rate": 1e-06, "loss": -0.0919, "num_tokens": 12028214.0, "reward": 0.18070882558822632, "reward_std": 0.04944847524166107, "rewards/bleu_reward_func/mean": 0.18070882558822632, "rewards/bleu_reward_func/std": 0.19004972279071808, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 285.75, "completions/mean_terminated_length": 130.94737243652344, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.7384, "grad_norm": 4.784849643707275, "kl": 0.07586669921875, "learning_rate": 1e-06, "loss": 0.131, "num_tokens": 12042278.0, "reward": 0.05333679914474487, "reward_std": 0.03152618184685707, "rewards/bleu_reward_func/mean": 0.05333679914474487, "rewards/bleu_reward_func/std": 0.055619917809963226, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 300.0625, "completions/mean_terminated_length": 27.571430206298828, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7392, "grad_norm": 5.440861701965332, "kl": 0.145477294921875, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 12056416.0, "reward": 0.14792697131633759, "reward_std": 0.02701294980943203, "rewards/bleu_reward_func/mean": 0.14792697131633759, "rewards/bleu_reward_func/std": 0.15142259001731873, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 286.15625, "completions/mean_terminated_length": 271.1000061035156, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.74, "grad_norm": 3.949329137802124, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.051, "num_tokens": 12069725.0, "reward": 0.07858790457248688, "reward_std": 0.02233020029962063, "rewards/bleu_reward_func/mean": 0.07858790457248688, "rewards/bleu_reward_func/std": 0.07242675125598907, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 89.10344696044922, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7408, "grad_norm": 11.58963394165039, "kl": 0.43695068359375, "learning_rate": 1e-06, "loss": 0.0795, "num_tokens": 12080085.0, "reward": 0.11042273789644241, "reward_std": 0.017381731420755386, "rewards/bleu_reward_func/mean": 0.11042273789644241, "rewards/bleu_reward_func/std": 0.04870026186108589, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 295.40625, "completions/mean_terminated_length": 126.94444274902344, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7416, "grad_norm": 9.117962837219238, "kl": 0.22723388671875, "learning_rate": 1e-06, "loss": -0.4999, "num_tokens": 12092682.0, "reward": 0.03860364854335785, "reward_std": 0.019805099815130234, "rewards/bleu_reward_func/mean": 0.03860364854335785, "rewards/bleu_reward_func/std": 0.024968957528471947, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 297.71875, "completions/mean_terminated_length": 200.3181915283203, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.7424, "grad_norm": 3.2410967350006104, "kl": 0.1171875, "learning_rate": 1e-06, "loss": -0.0166, "num_tokens": 12107465.0, "reward": 0.16988132894039154, "reward_std": 0.03467182815074921, "rewards/bleu_reward_func/mean": 0.16988132894039154, "rewards/bleu_reward_func/std": 0.1373591423034668, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 355.4375, "completions/mean_terminated_length": 233.6666717529297, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7432, "grad_norm": 14.617587089538574, "kl": 0.15447998046875, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 12121815.0, "reward": 0.14590570330619812, "reward_std": 0.026923291385173798, "rewards/bleu_reward_func/mean": 0.14590570330619812, "rewards/bleu_reward_func/std": 0.2141415923833847, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 262.78125, "completions/mean_terminated_length": 227.1785888671875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.744, "grad_norm": 3.273005962371826, "kl": 0.1136474609375, "learning_rate": 1e-06, "loss": -0.2043, "num_tokens": 12136200.0, "reward": 0.08558979630470276, "reward_std": 0.03035646863281727, "rewards/bleu_reward_func/mean": 0.08558979630470276, "rewards/bleu_reward_func/std": 0.0643271803855896, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 285.78125, "completions/mean_terminated_length": 150.0500030517578, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.7448, "grad_norm": 6.722025394439697, "kl": 0.117340087890625, "learning_rate": 1e-06, "loss": 0.0744, "num_tokens": 12147369.0, "reward": 0.071571946144104, "reward_std": 0.020615192130208015, "rewards/bleu_reward_func/mean": 0.071571946144104, "rewards/bleu_reward_func/std": 0.06541716307401657, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 391.03125, "completions/mean_terminated_length": 357.1600036621094, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.7456, "grad_norm": 2.371354103088379, "kl": 0.039031982421875, "learning_rate": 1e-06, "loss": -0.0517, "num_tokens": 12165498.0, "reward": 0.07511453330516815, "reward_std": 0.020994337275624275, "rewards/bleu_reward_func/mean": 0.07511453330516815, "rewards/bleu_reward_func/std": 0.043336208909749985, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 321.59375, "completions/mean_terminated_length": 235.0454559326172, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.7464, "grad_norm": 2.6573996543884277, "kl": 0.0595703125, "learning_rate": 1e-06, "loss": 0.2007, "num_tokens": 12178413.0, "reward": 0.07766060531139374, "reward_std": 0.030490310862660408, "rewards/bleu_reward_func/mean": 0.07766060531139374, "rewards/bleu_reward_func/std": 0.05291305482387543, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 389.9375, "completions/mean_terminated_length": 326.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.7472, "grad_norm": 2.6120660305023193, "kl": 0.0733642578125, "learning_rate": 1e-06, "loss": 0.0789, "num_tokens": 12194507.0, "reward": 0.10922634601593018, "reward_std": 0.0321655347943306, "rewards/bleu_reward_func/mean": 0.10922634601593018, "rewards/bleu_reward_func/std": 0.10983148962259293, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 228.21875, "completions/mean_terminated_length": 198.86207580566406, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.748, "grad_norm": 7.4974236488342285, "kl": 0.24505615234375, "learning_rate": 1e-06, "loss": -0.0333, "num_tokens": 12206842.0, "reward": 0.16729718446731567, "reward_std": 0.050741568207740784, "rewards/bleu_reward_func/mean": 0.16729718446731567, "rewards/bleu_reward_func/std": 0.2129126340150833, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 248.5625, "completions/mean_terminated_length": 90.5, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7488, "grad_norm": 8.694432258605957, "kl": 0.124114990234375, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 12216596.0, "reward": 0.10160160809755325, "reward_std": 0.03439757227897644, "rewards/bleu_reward_func/mean": 0.10160160809755325, "rewards/bleu_reward_func/std": 0.06317181140184402, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 79.04000091552734, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7496, "grad_norm": 7.92829704284668, "kl": 0.2811279296875, "learning_rate": 1e-06, "loss": -0.2142, "num_tokens": 12226292.0, "reward": 0.11500123143196106, "reward_std": 0.030234824866056442, "rewards/bleu_reward_func/mean": 0.11500123143196106, "rewards/bleu_reward_func/std": 0.12273158878087997, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 234.875, "completions/mean_terminated_length": 170.92308044433594, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7504, "grad_norm": 8.183011054992676, "kl": 0.261383056640625, "learning_rate": 1e-06, "loss": 0.017, "num_tokens": 12238216.0, "reward": 0.39511436223983765, "reward_std": 0.1106102392077446, "rewards/bleu_reward_func/mean": 0.39511436223983765, "rewards/bleu_reward_func/std": 0.3091021776199341, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 315.6875, "completions/mean_terminated_length": 163.0, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.7512, "grad_norm": 4.409646511077881, "kl": 0.092132568359375, "learning_rate": 1e-06, "loss": -0.0114, "num_tokens": 12254990.0, "reward": 0.1955508440732956, "reward_std": 0.016137830913066864, "rewards/bleu_reward_func/mean": 0.1955508440732956, "rewards/bleu_reward_func/std": 0.26973703503608704, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 276.875, "completions/mean_terminated_length": 69.4117660522461, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.752, "grad_norm": 7.45650577545166, "kl": 0.2735595703125, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 12268242.0, "reward": 0.07165145874023438, "reward_std": 0.020489612594246864, "rewards/bleu_reward_func/mean": 0.07165145874023438, "rewards/bleu_reward_func/std": 0.04259462654590607, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 150.8125, "completions/mean_terminated_length": 113.44827270507812, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7528, "grad_norm": 10.180867195129395, "kl": 0.4688720703125, "learning_rate": 1e-06, "loss": 0.122, "num_tokens": 12276916.0, "reward": 0.15257704257965088, "reward_std": 0.051439568400382996, "rewards/bleu_reward_func/mean": 0.15257704257965088, "rewards/bleu_reward_func/std": 0.11688338220119476, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 176.21739196777344, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7536, "grad_norm": 5.519646167755127, "kl": 0.07073974609375, "learning_rate": 1e-06, "loss": 0.437, "num_tokens": 12291593.0, "reward": 0.06806058436632156, "reward_std": 0.05050808936357498, "rewards/bleu_reward_func/mean": 0.06806058436632156, "rewards/bleu_reward_func/std": 0.06130353361368179, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 405.625, "completions/mean_terminated_length": 375.8399963378906, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7544, "grad_norm": 1.8502905368804932, "kl": 0.0356292724609375, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 12310893.0, "reward": 0.21372246742248535, "reward_std": 0.0709368884563446, "rewards/bleu_reward_func/mean": 0.21372246742248535, "rewards/bleu_reward_func/std": 0.1763986349105835, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 141.3913116455078, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.7552, "grad_norm": 3.3902478218078613, "kl": 0.05377197265625, "learning_rate": 1e-06, "loss": 0.076, "num_tokens": 12323369.0, "reward": 0.0433628112077713, "reward_std": 0.03261272981762886, "rewards/bleu_reward_func/mean": 0.0433628112077713, "rewards/bleu_reward_func/std": 0.0436432845890522, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 270.0625, "completions/mean_terminated_length": 189.4166717529297, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.756, "grad_norm": 5.770748615264893, "kl": 0.088531494140625, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 12333803.0, "reward": 0.10272043943405151, "reward_std": 0.03215545043349266, "rewards/bleu_reward_func/mean": 0.10272043943405151, "rewards/bleu_reward_func/std": 0.11694183200597763, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 194.48275756835938, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7568, "grad_norm": 7.7613067626953125, "kl": 0.20135498046875, "learning_rate": 1e-06, "loss": 0.1591, "num_tokens": 12344915.0, "reward": 0.08291373401880264, "reward_std": 0.024335253983736038, "rewards/bleu_reward_func/mean": 0.08291373401880264, "rewards/bleu_reward_func/std": 0.03890189528465271, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 455.6875, "completions/mean_terminated_length": 373.3846435546875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7576, "grad_norm": 2.599454879760742, "kl": 0.0421142578125, "learning_rate": 1e-06, "loss": -0.0235, "num_tokens": 12362161.0, "reward": 0.03875226154923439, "reward_std": 0.020147912204265594, "rewards/bleu_reward_func/mean": 0.03875226154923439, "rewards/bleu_reward_func/std": 0.023408547043800354, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 125.3846206665039, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.7584, "grad_norm": 6.522151947021484, "kl": 0.1690673828125, "learning_rate": 1e-06, "loss": 0.1959, "num_tokens": 12375709.0, "reward": 0.2021377682685852, "reward_std": 0.0921662300825119, "rewards/bleu_reward_func/mean": 0.2021377682685852, "rewards/bleu_reward_func/std": 0.28283461928367615, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 256.21875, "completions/mean_terminated_length": 122.23809814453125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.7592, "grad_norm": 5.671032905578613, "kl": 0.1173095703125, "learning_rate": 1e-06, "loss": -0.0303, "num_tokens": 12385764.0, "reward": 0.0564446821808815, "reward_std": 0.02071106806397438, "rewards/bleu_reward_func/mean": 0.0564446821808815, "rewards/bleu_reward_func/std": 0.030088067054748535, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 255.59375, "completions/mean_terminated_length": 229.0689697265625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.76, "grad_norm": 6.897347927093506, "kl": 0.2220458984375, "learning_rate": 1e-06, "loss": 0.1234, "num_tokens": 12396999.0, "reward": 0.09963001310825348, "reward_std": 0.05010713264346123, "rewards/bleu_reward_func/mean": 0.09963001310825348, "rewards/bleu_reward_func/std": 0.08052106946706772, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 456.5, "completions/mean_terminated_length": 418.52630615234375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.7608, "grad_norm": 2.444167137145996, "kl": 0.04534912109375, "learning_rate": 1e-06, "loss": -0.0097, "num_tokens": 12418039.0, "reward": 0.03447666019201279, "reward_std": 0.01355208083987236, "rewards/bleu_reward_func/mean": 0.03447666019201279, "rewards/bleu_reward_func/std": 0.022434458136558533, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 279.5625, "completions/mean_terminated_length": 120.52631378173828, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7616, "grad_norm": 8.821101188659668, "kl": 0.091888427734375, "learning_rate": 1e-06, "loss": 0.1929, "num_tokens": 12431177.0, "reward": 0.11506980657577515, "reward_std": 0.033062804490327835, "rewards/bleu_reward_func/mean": 0.11506980657577515, "rewards/bleu_reward_func/std": 0.0943976491689682, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 269.78125, "completions/mean_terminated_length": 201.95999145507812, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.7624, "grad_norm": 6.004775524139404, "kl": 0.07818603515625, "learning_rate": 1e-06, "loss": -0.1255, "num_tokens": 12445970.0, "reward": 0.09985020756721497, "reward_std": 0.0198547150939703, "rewards/bleu_reward_func/mean": 0.09985020756721497, "rewards/bleu_reward_func/std": 0.08852815628051758, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 318.8125, "completions/mean_terminated_length": 231.0, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.7632, "grad_norm": 6.8956804275512695, "kl": 0.175567626953125, "learning_rate": 1e-06, "loss": 0.0679, "num_tokens": 12458004.0, "reward": 0.1692689061164856, "reward_std": 0.03958010673522949, "rewards/bleu_reward_func/mean": 0.1692689061164856, "rewards/bleu_reward_func/std": 0.13855873048305511, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 250.9375, "completions/mean_terminated_length": 148.78260803222656, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.764, "grad_norm": 6.749716758728027, "kl": 0.293182373046875, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 12472626.0, "reward": 0.1224028617143631, "reward_std": 0.027801956981420517, "rewards/bleu_reward_func/mean": 0.1224028617143631, "rewards/bleu_reward_func/std": 0.07426659762859344, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 147.6875, "completions/mean_terminated_length": 123.40000915527344, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7648, "grad_norm": 9.662991523742676, "kl": 0.46905517578125, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 12481928.0, "reward": 0.23031684756278992, "reward_std": 0.0920054167509079, "rewards/bleu_reward_func/mean": 0.23031684756278992, "rewards/bleu_reward_func/std": 0.16612249612808228, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 258.65625, "completions/mean_terminated_length": 106.6500015258789, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7656, "grad_norm": 10.25383472442627, "kl": 0.2713623046875, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 12493805.0, "reward": 0.15187731385231018, "reward_std": 0.025371436029672623, "rewards/bleu_reward_func/mean": 0.15187731385231018, "rewards/bleu_reward_func/std": 0.12905065715312958, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 85.40625, "completions/mean_terminated_length": 71.64515686035156, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7664, "grad_norm": 8.22080135345459, "kl": 0.393310546875, "learning_rate": 1e-06, "loss": 0.3247, "num_tokens": 12502818.0, "reward": 0.29921823740005493, "reward_std": 0.11694261431694031, "rewards/bleu_reward_func/mean": 0.29921823740005493, "rewards/bleu_reward_func/std": 0.25639036297798157, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 386.09375, "completions/mean_terminated_length": 260.1875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.7672, "grad_norm": 2.847195863723755, "kl": 0.0467529296875, "learning_rate": 1e-06, "loss": 0.0852, "num_tokens": 12520317.0, "reward": 0.039544593542814255, "reward_std": 0.016648683696985245, "rewards/bleu_reward_func/mean": 0.039544593542814255, "rewards/bleu_reward_func/std": 0.034897446632385254, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 111.4375, "completions/mean_terminated_length": 111.4375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.768, "grad_norm": 8.353513717651367, "kl": 0.22430419921875, "learning_rate": 1e-06, "loss": 0.2469, "num_tokens": 12530027.0, "reward": 0.26215416193008423, "reward_std": 0.032358862459659576, "rewards/bleu_reward_func/mean": 0.26215416193008423, "rewards/bleu_reward_func/std": 0.22925570607185364, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 456.6875, "completions/mean_terminated_length": 315.3333435058594, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.7688, "grad_norm": 2.2755558490753174, "kl": 0.033355712890625, "learning_rate": 1e-06, "loss": 0.1802, "num_tokens": 12548593.0, "reward": 0.02548890933394432, "reward_std": 0.01250866986811161, "rewards/bleu_reward_func/mean": 0.02548890933394432, "rewards/bleu_reward_func/std": 0.0143959391862154, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 359.28125, "completions/mean_terminated_length": 206.5625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7696, "grad_norm": 5.7301130294799805, "kl": 0.165802001953125, "learning_rate": 1e-06, "loss": 0.0633, "num_tokens": 12566850.0, "reward": 0.09463217854499817, "reward_std": 0.021320462226867676, "rewards/bleu_reward_func/mean": 0.09463217854499817, "rewards/bleu_reward_func/std": 0.10299301147460938, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 388.125, "completions/mean_terminated_length": 313.8000183105469, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.7704, "grad_norm": 2.5917232036590576, "kl": 0.07171630859375, "learning_rate": 1e-06, "loss": 0.1643, "num_tokens": 12581438.0, "reward": 0.06743638217449188, "reward_std": 0.041416820138692856, "rewards/bleu_reward_func/mean": 0.06743638217449188, "rewards/bleu_reward_func/std": 0.0745474174618721, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 173.53125, "completions/mean_terminated_length": 162.61289978027344, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7712, "grad_norm": 5.022093772888184, "kl": 0.110015869140625, "learning_rate": 1e-06, "loss": 0.1948, "num_tokens": 12593087.0, "reward": 0.14474597573280334, "reward_std": 0.039374105632305145, "rewards/bleu_reward_func/mean": 0.14474597573280334, "rewards/bleu_reward_func/std": 0.0781283900141716, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 129.53125, "completions/mean_terminated_length": 129.53125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.772, "grad_norm": 5.708756446838379, "kl": 0.185546875, "learning_rate": 1e-06, "loss": -0.112, "num_tokens": 12604360.0, "reward": 0.17942924797534943, "reward_std": 0.04769964888691902, "rewards/bleu_reward_func/mean": 0.17942924797534943, "rewards/bleu_reward_func/std": 0.20441435277462006, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 143.15625, "completions/mean_terminated_length": 39.87999725341797, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7728, "grad_norm": 10.111825942993164, "kl": 0.39947509765625, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 12613941.0, "reward": 0.09816907346248627, "reward_std": 0.009084422141313553, "rewards/bleu_reward_func/mean": 0.09816907346248627, "rewards/bleu_reward_func/std": 0.06435907632112503, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 348.875, "completions/mean_terminated_length": 110.46154022216797, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7736, "grad_norm": 4.259431838989258, "kl": 0.144195556640625, "learning_rate": 1e-06, "loss": 0.1364, "num_tokens": 12628633.0, "reward": 0.11497487127780914, "reward_std": 0.0397370383143425, "rewards/bleu_reward_func/mean": 0.11497487127780914, "rewards/bleu_reward_func/std": 0.08479689061641693, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 100.53334045410156, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.7744, "grad_norm": 5.853184700012207, "kl": 0.2406005859375, "learning_rate": 1e-06, "loss": 0.0855, "num_tokens": 12642293.0, "reward": 0.1797194480895996, "reward_std": 0.06623274832963943, "rewards/bleu_reward_func/mean": 0.1797194480895996, "rewards/bleu_reward_func/std": 0.20514002442359924, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 276.65625, "completions/mean_terminated_length": 198.20834350585938, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7752, "grad_norm": 3.861785650253296, "kl": 0.048126220703125, "learning_rate": 1e-06, "loss": 0.3173, "num_tokens": 12653674.0, "reward": 0.06044634059071541, "reward_std": 0.02236868627369404, "rewards/bleu_reward_func/mean": 0.06044634059071541, "rewards/bleu_reward_func/std": 0.058306269347667694, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 364.4375, "completions/mean_terminated_length": 287.1428527832031, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.776, "grad_norm": 2.8329427242279053, "kl": 0.037445068359375, "learning_rate": 1e-06, "loss": 0.1258, "num_tokens": 12668128.0, "reward": 0.11927121132612228, "reward_std": 0.0374884158372879, "rewards/bleu_reward_func/mean": 0.11927121132612228, "rewards/bleu_reward_func/std": 0.10864724963903427, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 247.3125, "completions/mean_terminated_length": 198.29629516601562, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.7768, "grad_norm": 14.522860527038574, "kl": 0.3580322265625, "learning_rate": 1e-06, "loss": 0.327, "num_tokens": 12680266.0, "reward": 0.11901617795228958, "reward_std": 0.06829790771007538, "rewards/bleu_reward_func/mean": 0.11901617795228958, "rewards/bleu_reward_func/std": 0.0924401804804802, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 83.05555725097656, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.7776, "grad_norm": 5.036475658416748, "kl": 0.187713623046875, "learning_rate": 1e-06, "loss": 0.1832, "num_tokens": 12692713.0, "reward": 0.13872118294239044, "reward_std": 0.0687481164932251, "rewards/bleu_reward_func/mean": 0.13872118294239044, "rewards/bleu_reward_func/std": 0.14044460654258728, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 244.90625, "completions/mean_terminated_length": 155.875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.7784, "grad_norm": 7.775391578674316, "kl": 0.136444091796875, "learning_rate": 1e-06, "loss": 0.0436, "num_tokens": 12706702.0, "reward": 0.11104710400104523, "reward_std": 0.03642675280570984, "rewards/bleu_reward_func/mean": 0.11104710400104523, "rewards/bleu_reward_func/std": 0.09262983500957489, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 293.96875, "completions/mean_terminated_length": 194.8636474609375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.7792, "grad_norm": 36.50672149658203, "kl": 1.153228759765625, "learning_rate": 1e-06, "loss": 0.1761, "num_tokens": 12721461.0, "reward": 0.2013707458972931, "reward_std": 0.10103052109479904, "rewards/bleu_reward_func/mean": 0.2013707458972931, "rewards/bleu_reward_func/std": 0.16323500871658325, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 128.55172729492188, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.78, "grad_norm": 9.472455024719238, "kl": 0.2548828125, "learning_rate": 1e-06, "loss": 0.6259, "num_tokens": 12733317.0, "reward": 0.07841520756483078, "reward_std": 0.029445767402648926, "rewards/bleu_reward_func/mean": 0.07841520756483078, "rewards/bleu_reward_func/std": 0.0620783306658268, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 247.09375, "completions/mean_terminated_length": 229.433349609375, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.7808, "grad_norm": 4.399056911468506, "kl": 0.10150146484375, "learning_rate": 1e-06, "loss": 0.0484, "num_tokens": 12746712.0, "reward": 0.20041930675506592, "reward_std": 0.03039298765361309, "rewards/bleu_reward_func/mean": 0.20041930675506592, "rewards/bleu_reward_func/std": 0.2551174759864807, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 382.375, "completions/mean_terminated_length": 293.6842041015625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.7816, "grad_norm": 2.5359840393066406, "kl": 0.05230712890625, "learning_rate": 1e-06, "loss": 0.0699, "num_tokens": 12762020.0, "reward": 0.047146447002887726, "reward_std": 0.022996241226792336, "rewards/bleu_reward_func/mean": 0.047146447002887726, "rewards/bleu_reward_func/std": 0.04002131521701813, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 329.90625, "completions/mean_terminated_length": 234.52381896972656, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.7824, "grad_norm": 6.270121097564697, "kl": 0.1246337890625, "learning_rate": 1e-06, "loss": -0.0807, "num_tokens": 12778233.0, "reward": 0.1260184496641159, "reward_std": 0.029545176774263382, "rewards/bleu_reward_func/mean": 0.1260184496641159, "rewards/bleu_reward_func/std": 0.12758195400238037, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 127.6551742553711, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.7832, "grad_norm": 11.418375015258789, "kl": 0.1888427734375, "learning_rate": 1e-06, "loss": 0.4229, "num_tokens": 12788943.0, "reward": 0.22336477041244507, "reward_std": 0.0984843298792839, "rewards/bleu_reward_func/mean": 0.22336477041244507, "rewards/bleu_reward_func/std": 0.1921825110912323, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 141.6875, "completions/mean_terminated_length": 117.00000762939453, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.784, "grad_norm": 7.894710540771484, "kl": 0.4378662109375, "learning_rate": 1e-06, "loss": 0.2326, "num_tokens": 12797693.0, "reward": 0.1561349630355835, "reward_std": 0.05494026839733124, "rewards/bleu_reward_func/mean": 0.1561349630355835, "rewards/bleu_reward_func/std": 0.09167517721652985, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 255.34375, "completions/mean_terminated_length": 169.7916717529297, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.7848, "grad_norm": 19.572107315063477, "kl": 0.288604736328125, "learning_rate": 1e-06, "loss": 0.151, "num_tokens": 12812768.0, "reward": 0.06323938816785812, "reward_std": 0.017744949087500572, "rewards/bleu_reward_func/mean": 0.06323938816785812, "rewards/bleu_reward_func/std": 0.07885830849409103, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 281.65625, "completions/mean_terminated_length": 176.9545440673828, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.7856, "grad_norm": 8.172053337097168, "kl": 0.24444580078125, "learning_rate": 1e-06, "loss": 0.21, "num_tokens": 12829453.0, "reward": 0.0720784068107605, "reward_std": 0.03868547081947327, "rewards/bleu_reward_func/mean": 0.0720784068107605, "rewards/bleu_reward_func/std": 0.05159585550427437, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 235.84375, "completions/mean_terminated_length": 110.31818389892578, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7864, "grad_norm": 8.912215232849121, "kl": 0.32025146484375, "learning_rate": 1e-06, "loss": 0.12, "num_tokens": 12843376.0, "reward": 0.1997315138578415, "reward_std": 0.030267415568232536, "rewards/bleu_reward_func/mean": 0.1997315138578415, "rewards/bleu_reward_func/std": 0.17783835530281067, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 378.0, "completions/mean_terminated_length": 317.0909118652344, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.7872, "grad_norm": 2.6129956245422363, "kl": 0.036346435546875, "learning_rate": 1e-06, "loss": -0.1504, "num_tokens": 12859368.0, "reward": 0.07119783759117126, "reward_std": 0.018479108810424805, "rewards/bleu_reward_func/mean": 0.07119783759117126, "rewards/bleu_reward_func/std": 0.06165986508131027, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 220.03125, "completions/mean_terminated_length": 152.6538543701172, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.788, "grad_norm": 5.786254405975342, "kl": 0.15057373046875, "learning_rate": 1e-06, "loss": -0.1709, "num_tokens": 12869665.0, "reward": 0.14459165930747986, "reward_std": 0.03573929890990257, "rewards/bleu_reward_func/mean": 0.14459165930747986, "rewards/bleu_reward_func/std": 0.13286592066287994, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 357.125, "completions/mean_terminated_length": 220.47059631347656, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.7888, "grad_norm": 8.607572555541992, "kl": 0.24078369140625, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 12885501.0, "reward": 0.09036614745855331, "reward_std": 0.031877610832452774, "rewards/bleu_reward_func/mean": 0.09036614745855331, "rewards/bleu_reward_func/std": 0.05137631297111511, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 280.4444580078125, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.7896, "grad_norm": 2.421383857727051, "kl": 0.046905517578125, "learning_rate": 1e-06, "loss": 0.1548, "num_tokens": 12899517.0, "reward": 0.06479343771934509, "reward_std": 0.01870723068714142, "rewards/bleu_reward_func/mean": 0.06479343771934509, "rewards/bleu_reward_func/std": 0.039773859083652496, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 238.07144165039062, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.7904, "grad_norm": 3.447153329849243, "kl": 0.06671142578125, "learning_rate": 1e-06, "loss": -0.0649, "num_tokens": 12910175.0, "reward": 0.0866774171590805, "reward_std": 0.07288840413093567, "rewards/bleu_reward_func/mean": 0.0866774171590805, "rewards/bleu_reward_func/std": 0.10417941212654114, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 74.61538696289062, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7912, "grad_norm": 11.11733341217041, "kl": 0.34686279296875, "learning_rate": 1e-06, "loss": 0.4111, "num_tokens": 12920075.0, "reward": 0.18872088193893433, "reward_std": 0.05310884118080139, "rewards/bleu_reward_func/mean": 0.18872088193893433, "rewards/bleu_reward_func/std": 0.10052233934402466, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 251.15625, "completions/mean_terminated_length": 94.6500015258789, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.792, "grad_norm": 4.343538761138916, "kl": 0.0811767578125, "learning_rate": 1e-06, "loss": 0.255, "num_tokens": 12932672.0, "reward": 0.2421235740184784, "reward_std": 0.03650471195578575, "rewards/bleu_reward_func/mean": 0.2421235740184784, "rewards/bleu_reward_func/std": 0.35968947410583496, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 365.34375, "completions/mean_terminated_length": 307.9565124511719, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.7928, "grad_norm": 4.527655601501465, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": -0.0782, "num_tokens": 12947603.0, "reward": 0.05564543977379799, "reward_std": 0.033515315502882004, "rewards/bleu_reward_func/mean": 0.05564543977379799, "rewards/bleu_reward_func/std": 0.03913462907075882, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 128.46875, "completions/mean_terminated_length": 57.4444465637207, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.7936, "grad_norm": 9.973651885986328, "kl": 0.1922607421875, "learning_rate": 1e-06, "loss": 0.9373, "num_tokens": 12958538.0, "reward": 0.3186902403831482, "reward_std": 0.10882419347763062, "rewards/bleu_reward_func/mean": 0.3186902403831482, "rewards/bleu_reward_func/std": 0.2608534097671509, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 328.375, "completions/mean_terminated_length": 202.73684692382812, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7944, "grad_norm": 7.580811023712158, "kl": 0.290863037109375, "learning_rate": 1e-06, "loss": 0.1547, "num_tokens": 12975334.0, "reward": 0.16742870211601257, "reward_std": 0.03473435714840889, "rewards/bleu_reward_func/mean": 0.16742870211601257, "rewards/bleu_reward_func/std": 0.1612749844789505, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 47.03703689575195, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.7952, "grad_norm": 20.384323120117188, "kl": 0.74737548828125, "learning_rate": 1e-06, "loss": -0.0224, "num_tokens": 12988020.0, "reward": 0.1509585976600647, "reward_std": 0.0387745276093483, "rewards/bleu_reward_func/mean": 0.1509585976600647, "rewards/bleu_reward_func/std": 0.13122804462909698, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 346.84375, "completions/mean_terminated_length": 181.6875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.796, "grad_norm": 4.161476135253906, "kl": 0.05682373046875, "learning_rate": 1e-06, "loss": -0.1853, "num_tokens": 13003199.0, "reward": 0.026108039543032646, "reward_std": 0.02537854015827179, "rewards/bleu_reward_func/mean": 0.026108039543032646, "rewards/bleu_reward_func/std": 0.03443064168095589, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 181.0625, "completions/mean_terminated_length": 70.75, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.7968, "grad_norm": 14.441998481750488, "kl": 0.575714111328125, "learning_rate": 1e-06, "loss": 0.1517, "num_tokens": 13012889.0, "reward": 0.10783781111240387, "reward_std": 0.053533561527729034, "rewards/bleu_reward_func/mean": 0.10783781111240387, "rewards/bleu_reward_func/std": 0.09023794531822205, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 371.75, "completions/mean_terminated_length": 166.7692413330078, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.7976, "grad_norm": 3.627946615219116, "kl": 0.089813232421875, "learning_rate": 1e-06, "loss": -0.0463, "num_tokens": 13027321.0, "reward": 0.07810983061790466, "reward_std": 0.03820539265871048, "rewards/bleu_reward_func/mean": 0.07810983061790466, "rewards/bleu_reward_func/std": 0.07255319505929947, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 299.96875, "completions/mean_terminated_length": 203.59091186523438, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.7984, "grad_norm": 5.938675403594971, "kl": 0.163330078125, "learning_rate": 1e-06, "loss": 0.1322, "num_tokens": 13038888.0, "reward": 0.0998745784163475, "reward_std": 0.12165166437625885, "rewards/bleu_reward_func/mean": 0.0998745784163475, "rewards/bleu_reward_func/std": 0.2023635059595108, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 299.25, "completions/mean_terminated_length": 171.60000610351562, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.7992, "grad_norm": 5.83504581451416, "kl": 0.288848876953125, "learning_rate": 1e-06, "loss": 0.0509, "num_tokens": 13051904.0, "reward": 0.10337799787521362, "reward_std": 0.029087794944643974, "rewards/bleu_reward_func/mean": 0.10337799787521362, "rewards/bleu_reward_func/std": 0.07911896705627441, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 345.96875, "completions/mean_terminated_length": 322.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.8, "grad_norm": 2.916576385498047, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.05, "num_tokens": 13065431.0, "reward": 0.05868455022573471, "reward_std": 0.017369702458381653, "rewards/bleu_reward_func/mean": 0.05868455022573471, "rewards/bleu_reward_func/std": 0.04672805219888687, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 266.8125, "completions/mean_terminated_length": 250.4666748046875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.8008, "grad_norm": 5.116244316101074, "kl": 0.20050048828125, "learning_rate": 1e-06, "loss": 0.0918, "num_tokens": 13079225.0, "reward": 0.13771796226501465, "reward_std": 0.04302237555384636, "rewards/bleu_reward_func/mean": 0.13771796226501465, "rewards/bleu_reward_func/std": 0.10432249307632446, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 316.1875, "completions/mean_terminated_length": 239.56521606445312, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8016, "grad_norm": 3.050908088684082, "kl": 0.0596923828125, "learning_rate": 1e-06, "loss": -0.0323, "num_tokens": 13093775.0, "reward": 0.07833529263734818, "reward_std": 0.02821630984544754, "rewards/bleu_reward_func/mean": 0.07833529263734818, "rewards/bleu_reward_func/std": 0.06890382617712021, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 132.57144165039062, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.8024, "grad_norm": 6.102024078369141, "kl": 0.28985595703125, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 13104447.0, "reward": 0.27597150206565857, "reward_std": 0.04547630250453949, "rewards/bleu_reward_func/mean": 0.27597150206565857, "rewards/bleu_reward_func/std": 0.2288428395986557, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 391.25, "completions/mean_terminated_length": 254.40000915527344, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8032, "grad_norm": 3.2652981281280518, "kl": 0.084625244140625, "learning_rate": 1e-06, "loss": 0.1886, "num_tokens": 13119655.0, "reward": 0.13283666968345642, "reward_std": 0.029899559915065765, "rewards/bleu_reward_func/mean": 0.13283666968345642, "rewards/bleu_reward_func/std": 0.1536840796470642, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 203.8125, "completions/mean_terminated_length": 183.2666778564453, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.804, "grad_norm": 6.098532199859619, "kl": 0.1204833984375, "learning_rate": 1e-06, "loss": 0.0735, "num_tokens": 13128825.0, "reward": 0.21581590175628662, "reward_std": 0.10097475349903107, "rewards/bleu_reward_func/mean": 0.21581590175628662, "rewards/bleu_reward_func/std": 0.2611050307750702, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 358.1875, "completions/mean_terminated_length": 238.55555725097656, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8048, "grad_norm": 7.441494464874268, "kl": 0.191802978515625, "learning_rate": 1e-06, "loss": 0.0313, "num_tokens": 13142175.0, "reward": 0.20881031453609467, "reward_std": 0.03906787186861038, "rewards/bleu_reward_func/mean": 0.20881031453609467, "rewards/bleu_reward_func/std": 0.17651565372943878, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 307.65625, "completions/mean_terminated_length": 260.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.8056, "grad_norm": 3.559936761856079, "kl": 0.0572509765625, "learning_rate": 1e-06, "loss": 0.1028, "num_tokens": 13153852.0, "reward": 0.04283101111650467, "reward_std": 0.04057364910840988, "rewards/bleu_reward_func/mean": 0.04283101111650467, "rewards/bleu_reward_func/std": 0.059128936380147934, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 336.875, "completions/mean_terminated_length": 245.1428680419922, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.8064, "grad_norm": 4.735692024230957, "kl": 0.069091796875, "learning_rate": 1e-06, "loss": 0.0972, "num_tokens": 13171048.0, "reward": 0.09638670086860657, "reward_std": 0.019614677876234055, "rewards/bleu_reward_func/mean": 0.09638670086860657, "rewards/bleu_reward_func/std": 0.09023009240627289, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 329.59375, "completions/mean_terminated_length": 204.7894744873047, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.8072, "grad_norm": 7.47179651260376, "kl": 0.1318359375, "learning_rate": 1e-06, "loss": -0.0877, "num_tokens": 13187195.0, "reward": 0.036128196865320206, "reward_std": 0.020984536036849022, "rewards/bleu_reward_func/mean": 0.036128196865320206, "rewards/bleu_reward_func/std": 0.038413140922784805, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 53.5625, "completions/mean_terminated_length": 53.5625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.808, "grad_norm": 10.647790908813477, "kl": 0.482666015625, "learning_rate": 1e-06, "loss": -0.052, "num_tokens": 13197629.0, "reward": 0.34467193484306335, "reward_std": 0.09173881262540817, "rewards/bleu_reward_func/mean": 0.34467193484306335, "rewards/bleu_reward_func/std": 0.23519103229045868, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 191.6875, "completions/mean_terminated_length": 181.35482788085938, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8088, "grad_norm": 6.100401878356934, "kl": 0.141632080078125, "learning_rate": 1e-06, "loss": 0.0525, "num_tokens": 13210699.0, "reward": 0.23702389001846313, "reward_std": 0.03205852955579758, "rewards/bleu_reward_func/mean": 0.23702389001846313, "rewards/bleu_reward_func/std": 0.1315021812915802, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 374.4375, "completions/mean_terminated_length": 218.53334045410156, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.8096, "grad_norm": 8.772051811218262, "kl": 0.103546142578125, "learning_rate": 1e-06, "loss": 0.0866, "num_tokens": 13228945.0, "reward": 0.13881856203079224, "reward_std": 0.044034168124198914, "rewards/bleu_reward_func/mean": 0.13881856203079224, "rewards/bleu_reward_func/std": 0.0626484826207161, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 147.96875, "completions/mean_terminated_length": 95.96428680419922, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8104, "grad_norm": 14.525429725646973, "kl": 0.4837646484375, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 13242400.0, "reward": 0.20639193058013916, "reward_std": 0.05956702679395676, "rewards/bleu_reward_func/mean": 0.20639193058013916, "rewards/bleu_reward_func/std": 0.12604379653930664, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 149.90625, "completions/mean_terminated_length": 149.90625, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8112, "grad_norm": 22.3626766204834, "kl": 0.2891845703125, "learning_rate": 1e-06, "loss": 0.0612, "num_tokens": 13252157.0, "reward": 0.24893034994602203, "reward_std": 0.03380701690912247, "rewards/bleu_reward_func/mean": 0.24893034994602203, "rewards/bleu_reward_func/std": 0.20013003051280975, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 28.0, "completions/mean_length": 266.6875, "completions/mean_terminated_length": 21.375, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.812, "grad_norm": 7.276104927062988, "kl": 0.26300048828125, "learning_rate": 1e-06, "loss": -0.0225, "num_tokens": 13263955.0, "reward": 0.14417850971221924, "reward_std": 0.037887826561927795, "rewards/bleu_reward_func/mean": 0.14417850971221924, "rewards/bleu_reward_func/std": 0.1605955958366394, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 226.28125, "completions/mean_terminated_length": 146.27999877929688, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.8128, "grad_norm": 5.462210655212402, "kl": 0.13653564453125, "learning_rate": 1e-06, "loss": 0.1195, "num_tokens": 13273692.0, "reward": 0.08337672054767609, "reward_std": 0.02062853053212166, "rewards/bleu_reward_func/mean": 0.08337672054767609, "rewards/bleu_reward_func/std": 0.048102062195539474, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 354.0625, "completions/mean_terminated_length": 271.3333435058594, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.8136, "grad_norm": 5.125792980194092, "kl": 0.22271728515625, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 13289478.0, "reward": 0.19775965809822083, "reward_std": 0.04682963341474533, "rewards/bleu_reward_func/mean": 0.19775965809822083, "rewards/bleu_reward_func/std": 0.22582097351551056, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 183.65625, "completions/mean_terminated_length": 107.8846206665039, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.8144, "grad_norm": 7.784894943237305, "kl": 0.38018798828125, "learning_rate": 1e-06, "loss": -0.2707, "num_tokens": 13299747.0, "reward": 0.06794524192810059, "reward_std": 0.039994340389966965, "rewards/bleu_reward_func/mean": 0.06794524192810059, "rewards/bleu_reward_func/std": 0.0657891035079956, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 232.6875, "completions/mean_terminated_length": 203.79310607910156, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.8152, "grad_norm": 6.757852077484131, "kl": 0.23583984375, "learning_rate": 1e-06, "loss": -0.0289, "num_tokens": 13308865.0, "reward": 0.1327855885028839, "reward_std": 0.043607283383607864, "rewards/bleu_reward_func/mean": 0.1327855885028839, "rewards/bleu_reward_func/std": 0.1713796854019165, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 116.03125, "completions/mean_terminated_length": 75.06896209716797, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.816, "grad_norm": 6.1786370277404785, "kl": 0.21380615234375, "learning_rate": 1e-06, "loss": 0.1686, "num_tokens": 13318914.0, "reward": 0.23704446852207184, "reward_std": 0.057613980025053024, "rewards/bleu_reward_func/mean": 0.23704446852207184, "rewards/bleu_reward_func/std": 0.21550458669662476, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 188.34375, "completions/mean_terminated_length": 166.7666778564453, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8168, "grad_norm": 7.113982200622559, "kl": 0.258514404296875, "learning_rate": 1e-06, "loss": 0.0406, "num_tokens": 13329205.0, "reward": 0.08146457374095917, "reward_std": 0.02083425223827362, "rewards/bleu_reward_func/mean": 0.08146457374095917, "rewards/bleu_reward_func/std": 0.0736912190914154, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 248.1875, "completions/mean_terminated_length": 144.95652770996094, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.8176, "grad_norm": 10.716026306152344, "kl": 0.3465576171875, "learning_rate": 1e-06, "loss": -0.0356, "num_tokens": 13345867.0, "reward": 0.1497185379266739, "reward_std": 0.016201931983232498, "rewards/bleu_reward_func/mean": 0.1497185379266739, "rewards/bleu_reward_func/std": 0.17363472282886505, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 306.59375, "completions/mean_terminated_length": 238.125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.8184, "grad_norm": 5.428062915802002, "kl": 0.152679443359375, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 13360854.0, "reward": 0.1825982928276062, "reward_std": 0.057225678116083145, "rewards/bleu_reward_func/mean": 0.1825982928276062, "rewards/bleu_reward_func/std": 0.1867101639509201, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 103.6875, "completions/mean_terminated_length": 103.6875, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8192, "grad_norm": 9.348381042480469, "kl": 0.17828369140625, "learning_rate": 1e-06, "loss": 0.0549, "num_tokens": 13369980.0, "reward": 0.11240973323583603, "reward_std": 0.026126563549041748, "rewards/bleu_reward_func/mean": 0.11240973323583603, "rewards/bleu_reward_func/std": 0.11270570009946823, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 225.875, "completions/mean_terminated_length": 159.84616088867188, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.82, "grad_norm": 7.571503162384033, "kl": 0.237548828125, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 13384584.0, "reward": 0.2324497401714325, "reward_std": 0.0470973402261734, "rewards/bleu_reward_func/mean": 0.2324497401714325, "rewards/bleu_reward_func/std": 0.1243894025683403, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 139.34375, "completions/mean_terminated_length": 114.50000762939453, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.8208, "grad_norm": 8.227386474609375, "kl": 0.25714111328125, "learning_rate": 1e-06, "loss": 0.1704, "num_tokens": 13391771.0, "reward": 0.12480157613754272, "reward_std": 0.04330623894929886, "rewards/bleu_reward_func/mean": 0.12480157613754272, "rewards/bleu_reward_func/std": 0.103439562022686, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 214.3125, "completions/mean_terminated_length": 130.95999145507812, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8216, "grad_norm": 5.542022228240967, "kl": 0.24603271484375, "learning_rate": 1e-06, "loss": -0.1391, "num_tokens": 13403757.0, "reward": 0.25766974687576294, "reward_std": 0.03755660355091095, "rewards/bleu_reward_func/mean": 0.25766974687576294, "rewards/bleu_reward_func/std": 0.22421182692050934, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 459.53125, "completions/mean_terminated_length": 413.23529052734375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.8224, "grad_norm": 2.1804089546203613, "kl": 0.04327392578125, "learning_rate": 1e-06, "loss": -0.1423, "num_tokens": 13421078.0, "reward": 0.07269357144832611, "reward_std": 0.02826325222849846, "rewards/bleu_reward_func/mean": 0.07269357144832611, "rewards/bleu_reward_func/std": 0.034365471452474594, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 233.90625, "completions/mean_terminated_length": 194.17857360839844, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8232, "grad_norm": 5.306719779968262, "kl": 0.1806640625, "learning_rate": 1e-06, "loss": -0.4029, "num_tokens": 13432323.0, "reward": 0.18320006132125854, "reward_std": 0.08323986828327179, "rewards/bleu_reward_func/mean": 0.18320006132125854, "rewards/bleu_reward_func/std": 0.2284490317106247, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 258.4375, "completions/mean_terminated_length": 187.44000244140625, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.824, "grad_norm": 3.841357707977295, "kl": 0.070404052734375, "learning_rate": 1e-06, "loss": 0.1459, "num_tokens": 13444169.0, "reward": 0.09387044608592987, "reward_std": 0.07637906074523926, "rewards/bleu_reward_func/mean": 0.09387044608592987, "rewards/bleu_reward_func/std": 0.10294011980295181, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 121.59375, "completions/mean_terminated_length": 109.0, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.8248, "grad_norm": 10.732538223266602, "kl": 0.598388671875, "learning_rate": 1e-06, "loss": -0.0236, "num_tokens": 13450364.0, "reward": 0.18597961962223053, "reward_std": 0.03610639274120331, "rewards/bleu_reward_func/mean": 0.18597961962223053, "rewards/bleu_reward_func/std": 0.14241203665733337, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 174.15625, "completions/mean_terminated_length": 151.6333465576172, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.8256, "grad_norm": 5.849789619445801, "kl": 0.2237548828125, "learning_rate": 1e-06, "loss": -0.1461, "num_tokens": 13458489.0, "reward": 0.10662397742271423, "reward_std": 0.044935449957847595, "rewards/bleu_reward_func/mean": 0.10662397742271423, "rewards/bleu_reward_func/std": 0.08882930874824524, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 365.75, "completions/mean_terminated_length": 278.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.8264, "grad_norm": 2.3378233909606934, "kl": 0.040679931640625, "learning_rate": 1e-06, "loss": 0.018, "num_tokens": 13477497.0, "reward": 0.12139745056629181, "reward_std": 0.030839571729302406, "rewards/bleu_reward_func/mean": 0.12139745056629181, "rewards/bleu_reward_func/std": 0.087521493434906, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 307.8125, "completions/mean_terminated_length": 260.69232177734375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8272, "grad_norm": 5.579449653625488, "kl": 0.14544677734375, "learning_rate": 1e-06, "loss": -0.0832, "num_tokens": 13491595.0, "reward": 0.06439976394176483, "reward_std": 0.01632755994796753, "rewards/bleu_reward_func/mean": 0.06439976394176483, "rewards/bleu_reward_func/std": 0.025089839473366737, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 227.90625, "completions/mean_terminated_length": 162.34616088867188, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.828, "grad_norm": 6.95845890045166, "kl": 0.08489990234375, "learning_rate": 1e-06, "loss": 0.1636, "num_tokens": 13502304.0, "reward": 0.15672987699508667, "reward_std": 0.07095484435558319, "rewards/bleu_reward_func/mean": 0.15672987699508667, "rewards/bleu_reward_func/std": 0.1326054334640503, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 162.59375, "completions/mean_terminated_length": 139.3000030517578, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8288, "grad_norm": 7.602659702301025, "kl": 0.2142333984375, "learning_rate": 1e-06, "loss": -0.0952, "num_tokens": 13512379.0, "reward": 0.10166750848293304, "reward_std": 0.022390395402908325, "rewards/bleu_reward_func/mean": 0.10166750848293304, "rewards/bleu_reward_func/std": 0.09791414439678192, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 352.84375, "completions/mean_terminated_length": 257.3500061035156, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.8296, "grad_norm": 3.4908225536346436, "kl": 0.0816650390625, "learning_rate": 1e-06, "loss": 0.1513, "num_tokens": 13526246.0, "reward": 0.10761404037475586, "reward_std": 0.02660614624619484, "rewards/bleu_reward_func/mean": 0.10761404037475586, "rewards/bleu_reward_func/std": 0.08269859850406647, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 154.03125, "completions/mean_terminated_length": 87.74073791503906, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8304, "grad_norm": 8.316597938537598, "kl": 0.3048095703125, "learning_rate": 1e-06, "loss": 0.108, "num_tokens": 13538935.0, "reward": 0.14819365739822388, "reward_std": 0.07058853656053543, "rewards/bleu_reward_func/mean": 0.14819365739822388, "rewards/bleu_reward_func/std": 0.1550559103488922, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 209.37930297851562, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8312, "grad_norm": 4.724348545074463, "kl": 0.1529541015625, "learning_rate": 1e-06, "loss": 0.0635, "num_tokens": 13551415.0, "reward": 0.16776956617832184, "reward_std": 0.026334762573242188, "rewards/bleu_reward_func/mean": 0.16776956617832184, "rewards/bleu_reward_func/std": 0.18577900528907776, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 93.375, "completions/mean_terminated_length": 93.375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.832, "grad_norm": 8.702837944030762, "kl": 0.33355712890625, "learning_rate": 1e-06, "loss": 0.0333, "num_tokens": 13560747.0, "reward": 0.1746351718902588, "reward_std": 0.039413660764694214, "rewards/bleu_reward_func/mean": 0.1746351718902588, "rewards/bleu_reward_func/std": 0.13439369201660156, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 177.90625, "completions/mean_terminated_length": 66.54167175292969, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.8328, "grad_norm": 14.271418571472168, "kl": 0.298431396484375, "learning_rate": 1e-06, "loss": -0.356, "num_tokens": 13572328.0, "reward": 0.0881040021777153, "reward_std": 0.0392255075275898, "rewards/bleu_reward_func/mean": 0.0881040021777153, "rewards/bleu_reward_func/std": 0.086721271276474, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 243.34375, "completions/mean_terminated_length": 138.21739196777344, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.8336, "grad_norm": 3.9084980487823486, "kl": 0.07525634765625, "learning_rate": 1e-06, "loss": 0.0813, "num_tokens": 13586859.0, "reward": 0.06463417410850525, "reward_std": 0.022750139236450195, "rewards/bleu_reward_func/mean": 0.06463417410850525, "rewards/bleu_reward_func/std": 0.05624645948410034, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 328.96875, "completions/mean_terminated_length": 267.9583435058594, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8344, "grad_norm": 9.608210563659668, "kl": 0.2337646484375, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 13600978.0, "reward": 0.06980200856924057, "reward_std": 0.015845034271478653, "rewards/bleu_reward_func/mean": 0.06980200856924057, "rewards/bleu_reward_func/std": 0.03303433954715729, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 305.8125, "completions/mean_terminated_length": 145.44444274902344, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8352, "grad_norm": 4.128615379333496, "kl": 0.06011962890625, "learning_rate": 1e-06, "loss": 0.1566, "num_tokens": 13613068.0, "reward": 0.04747869074344635, "reward_std": 0.013655820861458778, "rewards/bleu_reward_func/mean": 0.04747869074344635, "rewards/bleu_reward_func/std": 0.028707411140203476, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 426.53125, "completions/mean_terminated_length": 368.0526428222656, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.836, "grad_norm": 2.515371799468994, "kl": 0.043975830078125, "learning_rate": 1e-06, "loss": -0.065, "num_tokens": 13629173.0, "reward": 0.028738608583807945, "reward_std": 0.012511001899838448, "rewards/bleu_reward_func/mean": 0.028738608583807945, "rewards/bleu_reward_func/std": 0.014564147219061852, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 206.03125, "completions/mean_terminated_length": 135.42308044433594, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8368, "grad_norm": 8.021496772766113, "kl": 0.273529052734375, "learning_rate": 1e-06, "loss": 0.1291, "num_tokens": 13637374.0, "reward": 0.11877701431512833, "reward_std": 0.04857534170150757, "rewards/bleu_reward_func/mean": 0.11877701431512833, "rewards/bleu_reward_func/std": 0.08409105986356735, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 251.65625, "completions/mean_terminated_length": 214.46429443359375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.8376, "grad_norm": 3.2490315437316895, "kl": 0.06829833984375, "learning_rate": 1e-06, "loss": -0.1358, "num_tokens": 13648067.0, "reward": 0.08158313482999802, "reward_std": 0.02561478689312935, "rewards/bleu_reward_func/mean": 0.08158313482999802, "rewards/bleu_reward_func/std": 0.05671805888414383, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 201.1199951171875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8384, "grad_norm": 5.032691955566406, "kl": 0.1785888671875, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 13660719.0, "reward": 0.18114086985588074, "reward_std": 0.03815930336713791, "rewards/bleu_reward_func/mean": 0.18114086985588074, "rewards/bleu_reward_func/std": 0.14998804032802582, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 87.25, "completions/mean_terminated_length": 73.54838562011719, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.8392, "grad_norm": 8.731785774230957, "kl": 0.384765625, "learning_rate": 1e-06, "loss": 0.0594, "num_tokens": 13670767.0, "reward": 0.23021195828914642, "reward_std": 0.09217022359371185, "rewards/bleu_reward_func/mean": 0.23021195828914642, "rewards/bleu_reward_func/std": 0.18223723769187927, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 80.44444274902344, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.84, "grad_norm": 8.531782150268555, "kl": 0.29345703125, "learning_rate": 1e-06, "loss": 0.2277, "num_tokens": 13685927.0, "reward": 0.13349372148513794, "reward_std": 0.053998030722141266, "rewards/bleu_reward_func/mean": 0.13349372148513794, "rewards/bleu_reward_func/std": 0.15102460980415344, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 375.4375, "completions/mean_terminated_length": 238.875, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.8408, "grad_norm": 7.057468414306641, "kl": 0.099639892578125, "learning_rate": 1e-06, "loss": 0.1437, "num_tokens": 13702781.0, "reward": 0.02950356900691986, "reward_std": 0.010849589481949806, "rewards/bleu_reward_func/mean": 0.02950356900691986, "rewards/bleu_reward_func/std": 0.02092377282679081, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 237.84375, "completions/mean_terminated_length": 187.07408142089844, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8416, "grad_norm": 8.259405136108398, "kl": 0.57330322265625, "learning_rate": 1e-06, "loss": 0.1046, "num_tokens": 13713560.0, "reward": 0.1828644871711731, "reward_std": 0.04976918175816536, "rewards/bleu_reward_func/mean": 0.1828644871711731, "rewards/bleu_reward_func/std": 0.13261918723583221, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 154.09375, "completions/mean_terminated_length": 87.81481170654297, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.8424, "grad_norm": 9.288348197937012, "kl": 0.424560546875, "learning_rate": 1e-06, "loss": -0.0227, "num_tokens": 13720387.0, "reward": 0.1263371855020523, "reward_std": 0.031269170343875885, "rewards/bleu_reward_func/mean": 0.1263371855020523, "rewards/bleu_reward_func/std": 0.1025131419301033, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 94.0, "completions/mean_length": 160.15625, "completions/mean_terminated_length": 42.875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8432, "grad_norm": 7.418272972106934, "kl": 0.32086181640625, "learning_rate": 1e-06, "loss": -0.0571, "num_tokens": 13731528.0, "reward": 0.2602638304233551, "reward_std": 0.07646072655916214, "rewards/bleu_reward_func/mean": 0.2602638304233551, "rewards/bleu_reward_func/std": 0.2308470755815506, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 183.46875, "completions/mean_terminated_length": 122.62963104248047, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.844, "grad_norm": 8.014601707458496, "kl": 0.3927001953125, "learning_rate": 1e-06, "loss": 0.1042, "num_tokens": 13742447.0, "reward": 0.07197493314743042, "reward_std": 0.012622429989278316, "rewards/bleu_reward_func/mean": 0.07197493314743042, "rewards/bleu_reward_func/std": 0.04327724501490593, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 240.59375, "completions/mean_terminated_length": 222.50001525878906, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8448, "grad_norm": 5.102974891662598, "kl": 0.1640625, "learning_rate": 1e-06, "loss": -0.0185, "num_tokens": 13755202.0, "reward": 0.08292236924171448, "reward_std": 0.023967744782567024, "rewards/bleu_reward_func/mean": 0.08292236924171448, "rewards/bleu_reward_func/std": 0.046691060066223145, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 133.0625, "completions/mean_terminated_length": 45.615386962890625, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.8456, "grad_norm": 12.413713455200195, "kl": 0.4334716796875, "learning_rate": 1e-06, "loss": 0.1467, "num_tokens": 13763268.0, "reward": 0.09796961396932602, "reward_std": 0.02291642501950264, "rewards/bleu_reward_func/mean": 0.09796961396932602, "rewards/bleu_reward_func/std": 0.04426925256848335, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 155.71875, "completions/mean_terminated_length": 89.74073791503906, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.8464, "grad_norm": 8.978612899780273, "kl": 0.228515625, "learning_rate": 1e-06, "loss": -0.0356, "num_tokens": 13771987.0, "reward": 0.10166356712579727, "reward_std": 0.055922288447618484, "rewards/bleu_reward_func/mean": 0.10166356712579727, "rewards/bleu_reward_func/std": 0.07498722523450851, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 408.84375, "completions/mean_terminated_length": 276.21429443359375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8472, "grad_norm": 1.850431203842163, "kl": 0.046905517578125, "learning_rate": 1e-06, "loss": 0.1476, "num_tokens": 13791446.0, "reward": 0.07564342021942139, "reward_std": 0.015303988009691238, "rewards/bleu_reward_func/mean": 0.07564342021942139, "rewards/bleu_reward_func/std": 0.09736621379852295, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 285.71875, "completions/mean_terminated_length": 197.17391967773438, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.848, "grad_norm": 4.632063388824463, "kl": 0.21221923828125, "learning_rate": 1e-06, "loss": 0.0453, "num_tokens": 13803221.0, "reward": 0.12073882669210434, "reward_std": 0.03981417790055275, "rewards/bleu_reward_func/mean": 0.12073882669210434, "rewards/bleu_reward_func/std": 0.07346338778734207, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 329.1875, "completions/mean_terminated_length": 246.09091186523438, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8488, "grad_norm": 4.053706645965576, "kl": 0.078857421875, "learning_rate": 1e-06, "loss": 0.2911, "num_tokens": 13818331.0, "reward": 0.06476722657680511, "reward_std": 0.030476348474621773, "rewards/bleu_reward_func/mean": 0.06476722657680511, "rewards/bleu_reward_func/std": 0.06862985342741013, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 57.03999710083008, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8496, "grad_norm": 8.067891120910645, "kl": 0.1793212890625, "learning_rate": 1e-06, "loss": 0.195, "num_tokens": 13826957.0, "reward": 0.20933812856674194, "reward_std": 0.051397278904914856, "rewards/bleu_reward_func/mean": 0.20933812856674194, "rewards/bleu_reward_func/std": 0.2901296019554138, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 227.09375, "completions/mean_terminated_length": 115.60869598388672, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.8504, "grad_norm": 7.58860445022583, "kl": 0.165283203125, "learning_rate": 1e-06, "loss": -0.0274, "num_tokens": 13838440.0, "reward": 0.11480045318603516, "reward_std": 0.027013186365365982, "rewards/bleu_reward_func/mean": 0.11480045318603516, "rewards/bleu_reward_func/std": 0.11019645631313324, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 189.1875, "completions/mean_terminated_length": 114.69231414794922, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8512, "grad_norm": 5.411109447479248, "kl": 0.1951904296875, "learning_rate": 1e-06, "loss": 0.0807, "num_tokens": 13849406.0, "reward": 0.1512412428855896, "reward_std": 0.062053047120571136, "rewards/bleu_reward_func/mean": 0.1512412428855896, "rewards/bleu_reward_func/std": 0.12469635158777237, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 259.5625, "completions/mean_terminated_length": 127.33333587646484, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.852, "grad_norm": 6.397778034210205, "kl": 0.24896240234375, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 13864056.0, "reward": 0.1552116870880127, "reward_std": 0.027821514755487442, "rewards/bleu_reward_func/mean": 0.1552116870880127, "rewards/bleu_reward_func/std": 0.10892557352781296, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 139.8125, "completions/mean_terminated_length": 139.8125, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.8528, "grad_norm": 8.31270694732666, "kl": 0.20196533203125, "learning_rate": 1e-06, "loss": 0.0519, "num_tokens": 13871490.0, "reward": 0.13230201601982117, "reward_std": 0.04370046779513359, "rewards/bleu_reward_func/mean": 0.13230201601982117, "rewards/bleu_reward_func/std": 0.08659063279628754, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 169.0625, "completions/mean_terminated_length": 73.04000091552734, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8536, "grad_norm": 6.236849784851074, "kl": 0.39727783203125, "learning_rate": 1e-06, "loss": 0.1366, "num_tokens": 13885556.0, "reward": 0.17620697617530823, "reward_std": 0.024748487398028374, "rewards/bleu_reward_func/mean": 0.17620697617530823, "rewards/bleu_reward_func/std": 0.10503184050321579, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 228.75, "completions/mean_terminated_length": 176.29629516601562, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8544, "grad_norm": 6.749774932861328, "kl": 0.181121826171875, "learning_rate": 1e-06, "loss": -0.1016, "num_tokens": 13894804.0, "reward": 0.17489100992679596, "reward_std": 0.042406514286994934, "rewards/bleu_reward_func/mean": 0.17489100992679596, "rewards/bleu_reward_func/std": 0.14329132437705994, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 246.6875, "completions/mean_terminated_length": 126.09091186523438, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8552, "grad_norm": 6.8404459953308105, "kl": 0.31402587890625, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 13907514.0, "reward": 0.16271845996379852, "reward_std": 0.04602063074707985, "rewards/bleu_reward_func/mean": 0.16271845996379852, "rewards/bleu_reward_func/std": 0.12579885125160217, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 269.78125, "completions/mean_terminated_length": 124.45000457763672, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.856, "grad_norm": 4.438868522644043, "kl": 0.177520751953125, "learning_rate": 1e-06, "loss": 0.0265, "num_tokens": 13919603.0, "reward": 0.13932910561561584, "reward_std": 0.01856398582458496, "rewards/bleu_reward_func/mean": 0.13932910561561584, "rewards/bleu_reward_func/std": 0.14700213074684143, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 149.9375, "completions/mean_terminated_length": 82.8888931274414, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8568, "grad_norm": 7.284147262573242, "kl": 0.3516845703125, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 13927953.0, "reward": 0.2614789605140686, "reward_std": 0.07057315111160278, "rewards/bleu_reward_func/mean": 0.2614789605140686, "rewards/bleu_reward_func/std": 0.1882997453212738, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 243.78125, "completions/mean_terminated_length": 168.67999267578125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.8576, "grad_norm": 5.096418380737305, "kl": 0.16357421875, "learning_rate": 1e-06, "loss": -0.0371, "num_tokens": 13942570.0, "reward": 0.07684318721294403, "reward_std": 0.019258558750152588, "rewards/bleu_reward_func/mean": 0.07684318721294403, "rewards/bleu_reward_func/std": 0.03753623366355896, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 107.15789794921875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8584, "grad_norm": 3.9537734985351562, "kl": 0.081634521484375, "learning_rate": 1e-06, "loss": -0.0181, "num_tokens": 13956350.0, "reward": 0.1140797883272171, "reward_std": 0.023730140179395676, "rewards/bleu_reward_func/mean": 0.1140797883272171, "rewards/bleu_reward_func/std": 0.1426122635602951, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 323.4375, "completions/mean_terminated_length": 237.72727966308594, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8592, "grad_norm": 4.239878177642822, "kl": 0.10797119140625, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 13971620.0, "reward": 0.10953356325626373, "reward_std": 0.07727043330669403, "rewards/bleu_reward_func/mean": 0.10953356325626373, "rewards/bleu_reward_func/std": 0.1143500879406929, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 447.5625, "completions/mean_terminated_length": 383.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.86, "grad_norm": 2.2002503871917725, "kl": 0.049072265625, "learning_rate": 1e-06, "loss": -0.0472, "num_tokens": 13990150.0, "reward": 0.08343654125928879, "reward_std": 0.02118324115872383, "rewards/bleu_reward_func/mean": 0.08343654125928879, "rewards/bleu_reward_func/std": 0.08093992620706558, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 472.125, "completions/mean_terminated_length": 420.8571472167969, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.8608, "grad_norm": 2.0650570392608643, "kl": 0.032440185546875, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 14011170.0, "reward": 0.04162130132317543, "reward_std": 0.010478474199771881, "rewards/bleu_reward_func/mean": 0.04162130132317543, "rewards/bleu_reward_func/std": 0.017952080816030502, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 242.78125, "completions/mean_terminated_length": 58.578948974609375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.8616, "grad_norm": 4.682583808898926, "kl": 0.316986083984375, "learning_rate": 1e-06, "loss": -0.0201, "num_tokens": 14025299.0, "reward": 0.2864699065685272, "reward_std": 0.03820549696683884, "rewards/bleu_reward_func/mean": 0.2864699065685272, "rewards/bleu_reward_func/std": 0.26346415281295776, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 50.16666793823242, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.8624, "grad_norm": 7.724006652832031, "kl": 0.23388671875, "learning_rate": 1e-06, "loss": -0.0182, "num_tokens": 14037399.0, "reward": 0.1795015037059784, "reward_std": 0.07820923626422882, "rewards/bleu_reward_func/mean": 0.1795015037059784, "rewards/bleu_reward_func/std": 0.15337687730789185, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 276.09375, "completions/mean_terminated_length": 40.1875, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8632, "grad_norm": 8.850776672363281, "kl": 0.24285888671875, "learning_rate": 1e-06, "loss": -0.1453, "num_tokens": 14052562.0, "reward": 0.09584256261587143, "reward_std": 0.01827467978000641, "rewards/bleu_reward_func/mean": 0.09584256261587143, "rewards/bleu_reward_func/std": 0.10066576302051544, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 203.1875, "completions/mean_terminated_length": 82.34782409667969, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.864, "grad_norm": 4.735725402832031, "kl": 0.0994873046875, "learning_rate": 1e-06, "loss": 0.2218, "num_tokens": 14061568.0, "reward": 0.27030178904533386, "reward_std": 0.057654060423374176, "rewards/bleu_reward_func/mean": 0.27030178904533386, "rewards/bleu_reward_func/std": 0.16439270973205566, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 174.61538696289062, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8648, "grad_norm": 4.542383193969727, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": -0.1576, "num_tokens": 14071236.0, "reward": 0.05943232774734497, "reward_std": 0.03797609731554985, "rewards/bleu_reward_func/mean": 0.05943232774734497, "rewards/bleu_reward_func/std": 0.07494883239269257, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 214.625, "completions/mean_terminated_length": 146.0, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.8656, "grad_norm": 9.019255638122559, "kl": 0.340179443359375, "learning_rate": 1e-06, "loss": 0.0456, "num_tokens": 14080680.0, "reward": 0.09385368227958679, "reward_std": 0.041810497641563416, "rewards/bleu_reward_func/mean": 0.09385368227958679, "rewards/bleu_reward_func/std": 0.04523298144340515, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 111.3125, "completions/mean_terminated_length": 54.07143020629883, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8664, "grad_norm": 5.487109184265137, "kl": 0.3203125, "learning_rate": 1e-06, "loss": 0.195, "num_tokens": 14093178.0, "reward": 0.2254057228565216, "reward_std": 0.0354473814368248, "rewards/bleu_reward_func/mean": 0.2254057228565216, "rewards/bleu_reward_func/std": 0.15529486536979675, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 473.03125, "completions/mean_terminated_length": 408.0833435058594, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8672, "grad_norm": 2.1760456562042236, "kl": 0.05029296875, "learning_rate": 1e-06, "loss": -0.1127, "num_tokens": 14112195.0, "reward": 0.11573594808578491, "reward_std": 0.034562353044748306, "rewards/bleu_reward_func/mean": 0.11573594808578491, "rewards/bleu_reward_func/std": 0.0883888527750969, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 248.8125, "completions/mean_terminated_length": 110.95238494873047, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.868, "grad_norm": 20.237722396850586, "kl": 0.27996826171875, "learning_rate": 1e-06, "loss": -0.0279, "num_tokens": 14125485.0, "reward": 0.06478704512119293, "reward_std": 0.01746372878551483, "rewards/bleu_reward_func/mean": 0.06478704512119293, "rewards/bleu_reward_func/std": 0.04760226234793663, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 254.21875, "completions/mean_terminated_length": 194.73077392578125, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.8688, "grad_norm": 4.712401390075684, "kl": 0.119384765625, "learning_rate": 1e-06, "loss": 0.2418, "num_tokens": 14136628.0, "reward": 0.07501716911792755, "reward_std": 0.022581705823540688, "rewards/bleu_reward_func/mean": 0.07501716911792755, "rewards/bleu_reward_func/std": 0.045875921845436096, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 214.4375, "completions/mean_terminated_length": 145.7692413330078, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.8696, "grad_norm": 7.036587715148926, "kl": 0.40423583984375, "learning_rate": 1e-06, "loss": -0.0659, "num_tokens": 14152290.0, "reward": 0.24361515045166016, "reward_std": 0.05023983493447304, "rewards/bleu_reward_func/mean": 0.24361515045166016, "rewards/bleu_reward_func/std": 0.2318515181541443, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 310.15625, "completions/mean_terminated_length": 281.3214416503906, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.8704, "grad_norm": 2.8048856258392334, "kl": 0.06768798828125, "learning_rate": 1e-06, "loss": -0.1012, "num_tokens": 14164015.0, "reward": 0.06632909178733826, "reward_std": 0.022660713642835617, "rewards/bleu_reward_func/mean": 0.06632909178733826, "rewards/bleu_reward_func/std": 0.05323861911892891, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 72.03125, "completions/mean_terminated_length": 57.838706970214844, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.8712, "grad_norm": 9.539746284484863, "kl": 0.28515625, "learning_rate": 1e-06, "loss": 0.2369, "num_tokens": 14175360.0, "reward": 0.2565135359764099, "reward_std": 0.06622748076915741, "rewards/bleu_reward_func/mean": 0.2565135359764099, "rewards/bleu_reward_func/std": 0.2024916261434555, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 253.8125, "completions/mean_terminated_length": 136.4545440673828, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.872, "grad_norm": 8.92574691772461, "kl": 0.2467041015625, "learning_rate": 1e-06, "loss": 0.2116, "num_tokens": 14187954.0, "reward": 0.10993756353855133, "reward_std": 0.041833557188510895, "rewards/bleu_reward_func/mean": 0.10993756353855133, "rewards/bleu_reward_func/std": 0.13020949065685272, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 104.47999572753906, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.8728, "grad_norm": 5.538515090942383, "kl": 0.0902099609375, "learning_rate": 1e-06, "loss": -0.1573, "num_tokens": 14196270.0, "reward": 0.05786508321762085, "reward_std": 0.02371850796043873, "rewards/bleu_reward_func/mean": 0.05786508321762085, "rewards/bleu_reward_func/std": 0.03462414816021919, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 223.09375, "completions/mean_terminated_length": 156.42308044433594, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.8736, "grad_norm": 5.6724443435668945, "kl": 0.179168701171875, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 14208633.0, "reward": 0.107764333486557, "reward_std": 0.021315133199095726, "rewards/bleu_reward_func/mean": 0.107764333486557, "rewards/bleu_reward_func/std": 0.038448914885520935, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 166.78125, "completions/mean_terminated_length": 143.7666778564453, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.8744, "grad_norm": 5.278230667114258, "kl": 0.2412109375, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 14218242.0, "reward": 0.4157559275627136, "reward_std": 0.037054967135190964, "rewards/bleu_reward_func/mean": 0.4157559275627136, "rewards/bleu_reward_func/std": 0.2559570372104645, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 313.09375, "completions/mean_terminated_length": 114.1875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8752, "grad_norm": 3.892812490463257, "kl": 0.13092041015625, "learning_rate": 1e-06, "loss": 0.1517, "num_tokens": 14232805.0, "reward": 0.12693487107753754, "reward_std": 0.04035983234643936, "rewards/bleu_reward_func/mean": 0.12693487107753754, "rewards/bleu_reward_func/std": 0.12727496027946472, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 205.1875, "completions/mean_terminated_length": 161.35714721679688, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.876, "grad_norm": 8.024927139282227, "kl": 0.350433349609375, "learning_rate": 1e-06, "loss": 0.2352, "num_tokens": 14244755.0, "reward": 0.20502734184265137, "reward_std": 0.09303287416696548, "rewards/bleu_reward_func/mean": 0.20502734184265137, "rewards/bleu_reward_func/std": 0.241799458861351, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 361.1875, "completions/mean_terminated_length": 282.19049072265625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8768, "grad_norm": 2.944213390350342, "kl": 0.068603515625, "learning_rate": 1e-06, "loss": -0.106, "num_tokens": 14257801.0, "reward": 0.04917216673493385, "reward_std": 0.011258791200816631, "rewards/bleu_reward_func/mean": 0.04917216673493385, "rewards/bleu_reward_func/std": 0.038949303328990936, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 234.21875, "completions/mean_terminated_length": 67.55000305175781, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8776, "grad_norm": 6.089639663696289, "kl": 0.22784423828125, "learning_rate": 1e-06, "loss": 0.1116, "num_tokens": 14267648.0, "reward": 0.10858422517776489, "reward_std": 0.04780227690935135, "rewards/bleu_reward_func/mean": 0.10858422517776489, "rewards/bleu_reward_func/std": 0.07767506688833237, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 383.21875, "completions/mean_terminated_length": 217.6428680419922, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8784, "grad_norm": 2.4417130947113037, "kl": 0.039581298828125, "learning_rate": 1e-06, "loss": 0.101, "num_tokens": 14284503.0, "reward": 0.1223280131816864, "reward_std": 0.058498185127973557, "rewards/bleu_reward_func/mean": 0.1223280131816864, "rewards/bleu_reward_func/std": 0.08976288139820099, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 78.60869598388672, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8792, "grad_norm": 6.311417102813721, "kl": 0.34259033203125, "learning_rate": 1e-06, "loss": 0.0608, "num_tokens": 14294455.0, "reward": 0.12745052576065063, "reward_std": 0.048099152743816376, "rewards/bleu_reward_func/mean": 0.12745052576065063, "rewards/bleu_reward_func/std": 0.12118762731552124, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 244.1538543701172, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.88, "grad_norm": 5.328085422515869, "kl": 0.184539794921875, "learning_rate": 1e-06, "loss": -0.0179, "num_tokens": 14311835.0, "reward": 0.2192595899105072, "reward_std": 0.042960211634635925, "rewards/bleu_reward_func/mean": 0.2192595899105072, "rewards/bleu_reward_func/std": 0.13524703681468964, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 324.625, "completions/mean_terminated_length": 262.16668701171875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.8808, "grad_norm": 2.6405608654022217, "kl": 0.0440673828125, "learning_rate": 1e-06, "loss": 0.1205, "num_tokens": 14327975.0, "reward": 0.14465495944023132, "reward_std": 0.04526882618665695, "rewards/bleu_reward_func/mean": 0.14465495944023132, "rewards/bleu_reward_func/std": 0.1039966493844986, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 336.125, "completions/mean_terminated_length": 180.94117736816406, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.8816, "grad_norm": 3.4212841987609863, "kl": 0.0875244140625, "learning_rate": 1e-06, "loss": 0.1697, "num_tokens": 14343595.0, "reward": 0.07303881645202637, "reward_std": 0.023782189935445786, "rewards/bleu_reward_func/mean": 0.07303881645202637, "rewards/bleu_reward_func/std": 0.10003393888473511, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 145.9199981689453, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.8824, "grad_norm": 7.256896018981934, "kl": 0.2523193359375, "learning_rate": 1e-06, "loss": 0.0389, "num_tokens": 14357595.0, "reward": 0.14129553735256195, "reward_std": 0.05766978859901428, "rewards/bleu_reward_func/mean": 0.14129553735256195, "rewards/bleu_reward_func/std": 0.13982893526554108, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 375.65625, "completions/mean_terminated_length": 313.68182373046875, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.8832, "grad_norm": 2.429725170135498, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0492, "num_tokens": 14372368.0, "reward": 0.0918026864528656, "reward_std": 0.019557196646928787, "rewards/bleu_reward_func/mean": 0.0918026864528656, "rewards/bleu_reward_func/std": 0.0768144428730011, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 81.0625, "completions/mean_terminated_length": 36.482757568359375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.884, "grad_norm": 10.26905345916748, "kl": 0.4246826171875, "learning_rate": 1e-06, "loss": 0.4653, "num_tokens": 14379298.0, "reward": 0.1903451383113861, "reward_std": 0.0727916806936264, "rewards/bleu_reward_func/mean": 0.1903451383113861, "rewards/bleu_reward_func/std": 0.18681129813194275, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 218.09375, "completions/mean_terminated_length": 103.08695983886719, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8848, "grad_norm": 6.238147735595703, "kl": 0.2109375, "learning_rate": 1e-06, "loss": 0.1147, "num_tokens": 14390325.0, "reward": 0.10796605050563812, "reward_std": 0.028253143653273582, "rewards/bleu_reward_func/mean": 0.10796605050563812, "rewards/bleu_reward_func/std": 0.08980042487382889, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 334.3125, "completions/mean_terminated_length": 177.5294189453125, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.8856, "grad_norm": 5.977288246154785, "kl": 0.12933349609375, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 14405719.0, "reward": 0.08215081691741943, "reward_std": 0.012335095554590225, "rewards/bleu_reward_func/mean": 0.08215081691741943, "rewards/bleu_reward_func/std": 0.0935206413269043, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 368.25, "completions/mean_terminated_length": 224.5, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.8864, "grad_norm": 3.1431217193603516, "kl": 0.046478271484375, "learning_rate": 1e-06, "loss": 0.0529, "num_tokens": 14420959.0, "reward": 0.07733479142189026, "reward_std": 0.04769892990589142, "rewards/bleu_reward_func/mean": 0.07733479142189026, "rewards/bleu_reward_func/std": 0.07268877327442169, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 205.0625, "completions/mean_terminated_length": 148.22222900390625, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.8872, "grad_norm": 8.718172073364258, "kl": 0.27215576171875, "learning_rate": 1e-06, "loss": -0.0245, "num_tokens": 14431297.0, "reward": 0.08119820058345795, "reward_std": 0.02793770469725132, "rewards/bleu_reward_func/mean": 0.08119820058345795, "rewards/bleu_reward_func/std": 0.046033825725317, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 218.59375, "completions/mean_terminated_length": 188.2413787841797, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.888, "grad_norm": 7.532926082611084, "kl": 0.197265625, "learning_rate": 1e-06, "loss": 0.2, "num_tokens": 14440556.0, "reward": 0.16878977417945862, "reward_std": 0.06408128887414932, "rewards/bleu_reward_func/mean": 0.16878977417945862, "rewards/bleu_reward_func/std": 0.17819638550281525, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 169.1666717529297, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.8888, "grad_norm": 5.752809047698975, "kl": 0.151947021484375, "learning_rate": 1e-06, "loss": -0.0323, "num_tokens": 14453832.0, "reward": 0.08754751831293106, "reward_std": 0.041982948780059814, "rewards/bleu_reward_func/mean": 0.08754751831293106, "rewards/bleu_reward_func/std": 0.08986286073923111, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 431.9375, "completions/mean_terminated_length": 298.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.8896, "grad_norm": 2.506591320037842, "kl": 0.04913330078125, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 14470966.0, "reward": 0.05948667228221893, "reward_std": 0.031033214181661606, "rewards/bleu_reward_func/mean": 0.05948667228221893, "rewards/bleu_reward_func/std": 0.04187482222914696, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 458.125, "completions/mean_terminated_length": 404.25, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8904, "grad_norm": 2.1371476650238037, "kl": 0.05169677734375, "learning_rate": 1e-06, "loss": 0.0275, "num_tokens": 14488386.0, "reward": 0.07635128498077393, "reward_std": 0.02666424587368965, "rewards/bleu_reward_func/mean": 0.07635128498077393, "rewards/bleu_reward_func/std": 0.06230180338025093, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 381.3125, "completions/mean_terminated_length": 321.9090881347656, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.8912, "grad_norm": 2.5617504119873047, "kl": 0.05352783203125, "learning_rate": 1e-06, "loss": -0.0071, "num_tokens": 14502500.0, "reward": 0.1177460253238678, "reward_std": 0.02880302257835865, "rewards/bleu_reward_func/mean": 0.1177460253238678, "rewards/bleu_reward_func/std": 0.06036384403705597, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 293.39129638671875, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.892, "grad_norm": 6.915249824523926, "kl": 0.162353515625, "learning_rate": 1e-06, "loss": -0.1012, "num_tokens": 14515272.0, "reward": 0.08914826065301895, "reward_std": 0.03028780408203602, "rewards/bleu_reward_func/mean": 0.08914826065301895, "rewards/bleu_reward_func/std": 0.042083028703927994, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 145.96875, "completions/mean_terminated_length": 61.500003814697266, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8928, "grad_norm": 8.211282730102539, "kl": 0.33642578125, "learning_rate": 1e-06, "loss": -0.0331, "num_tokens": 14523199.0, "reward": 0.13895554840564728, "reward_std": 0.06001996994018555, "rewards/bleu_reward_func/mean": 0.13895554840564728, "rewards/bleu_reward_func/std": 0.10717976838350296, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 269.8125, "completions/mean_terminated_length": 175.04348754882812, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.8936, "grad_norm": 9.093502044677734, "kl": 0.18402099609375, "learning_rate": 1e-06, "loss": -0.0448, "num_tokens": 14535921.0, "reward": 0.08600494265556335, "reward_std": 0.01430382952094078, "rewards/bleu_reward_func/mean": 0.08600494265556335, "rewards/bleu_reward_func/std": 0.03352402523159981, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 220.65625, "completions/mean_terminated_length": 166.70370483398438, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.8944, "grad_norm": 5.133798122406006, "kl": 0.23126220703125, "learning_rate": 1e-06, "loss": -0.0611, "num_tokens": 14546502.0, "reward": 0.1281801015138626, "reward_std": 0.033460669219493866, "rewards/bleu_reward_func/mean": 0.1281801015138626, "rewards/bleu_reward_func/std": 0.09999439865350723, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 414.90625, "completions/mean_terminated_length": 382.54168701171875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.8952, "grad_norm": 2.3336856365203857, "kl": 0.05023193359375, "learning_rate": 1e-06, "loss": -0.0381, "num_tokens": 14563043.0, "reward": 0.09009081870317459, "reward_std": 0.024957649409770966, "rewards/bleu_reward_func/mean": 0.09009081870317459, "rewards/bleu_reward_func/std": 0.07389495521783829, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 249.375, "completions/mean_terminated_length": 188.7692413330078, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.896, "grad_norm": 4.001770496368408, "kl": 0.06622314453125, "learning_rate": 1e-06, "loss": 0.4376, "num_tokens": 14574703.0, "reward": 0.12255299836397171, "reward_std": 0.06612245738506317, "rewards/bleu_reward_func/mean": 0.12255299836397171, "rewards/bleu_reward_func/std": 0.1745522916316986, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 341.96875, "completions/mean_terminated_length": 264.68182373046875, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.8968, "grad_norm": 5.190872669219971, "kl": 0.0662841796875, "learning_rate": 1e-06, "loss": -0.2989, "num_tokens": 14590934.0, "reward": 0.015649927780032158, "reward_std": 0.008883368223905563, "rewards/bleu_reward_func/mean": 0.015649927780032158, "rewards/bleu_reward_func/std": 0.014249512925744057, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 204.71875, "completions/mean_terminated_length": 118.68000030517578, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.8976, "grad_norm": 7.992037296295166, "kl": 0.22357177734375, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 14601957.0, "reward": 0.14590373635292053, "reward_std": 0.032411251217126846, "rewards/bleu_reward_func/mean": 0.14590373635292053, "rewards/bleu_reward_func/std": 0.1304609477519989, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 298.4375, "completions/mean_terminated_length": 132.3333282470703, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.8984, "grad_norm": 6.297399044036865, "kl": 0.05999755859375, "learning_rate": 1e-06, "loss": 0.3021, "num_tokens": 14616019.0, "reward": 0.071531280875206, "reward_std": 0.02597668580710888, "rewards/bleu_reward_func/mean": 0.071531280875206, "rewards/bleu_reward_func/std": 0.05073075741529465, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 353.90625, "completions/mean_terminated_length": 174.73333740234375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.8992, "grad_norm": 2.9494781494140625, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": -0.0383, "num_tokens": 14634248.0, "reward": 0.20859137177467346, "reward_std": 0.026030534878373146, "rewards/bleu_reward_func/mean": 0.20859137177467346, "rewards/bleu_reward_func/std": 0.25668489933013916, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 337.03125, "completions/mean_terminated_length": 200.94444274902344, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.9, "grad_norm": 3.6260087490081787, "kl": 0.06976318359375, "learning_rate": 1e-06, "loss": -0.0424, "num_tokens": 14646505.0, "reward": 0.03524015098810196, "reward_std": 0.021195726469159126, "rewards/bleu_reward_func/mean": 0.03524015098810196, "rewards/bleu_reward_func/std": 0.04081031307578087, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 117.39130401611328, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.9008, "grad_norm": 6.014510154724121, "kl": 0.232421875, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 14657525.0, "reward": 0.06341119110584259, "reward_std": 0.03255104646086693, "rewards/bleu_reward_func/mean": 0.06341119110584259, "rewards/bleu_reward_func/std": 0.04315832257270813, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 238.21875, "completions/mean_terminated_length": 175.03846740722656, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9016, "grad_norm": 4.443993091583252, "kl": 0.07666015625, "learning_rate": 1e-06, "loss": 0.1492, "num_tokens": 14669820.0, "reward": 0.050140924751758575, "reward_std": 0.02134326659142971, "rewards/bleu_reward_func/mean": 0.050140924751758575, "rewards/bleu_reward_func/std": 0.054666925221681595, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 206.8125, "completions/mean_terminated_length": 121.36000061035156, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9024, "grad_norm": 6.487152099609375, "kl": 0.30072021484375, "learning_rate": 1e-06, "loss": 0.1095, "num_tokens": 14678878.0, "reward": 0.20913344621658325, "reward_std": 0.06204414367675781, "rewards/bleu_reward_func/mean": 0.20913344621658325, "rewards/bleu_reward_func/std": 0.15058699250221252, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 168.5625, "completions/mean_terminated_length": 104.96296691894531, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.9032, "grad_norm": 5.987677097320557, "kl": 0.161376953125, "learning_rate": 1e-06, "loss": 0.1159, "num_tokens": 14690400.0, "reward": 0.22108127176761627, "reward_std": 0.03181886300444603, "rewards/bleu_reward_func/mean": 0.22108127176761627, "rewards/bleu_reward_func/std": 0.21734413504600525, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 224.40625, "completions/mean_terminated_length": 194.65516662597656, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.904, "grad_norm": 4.591039180755615, "kl": 0.1512451171875, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 14703621.0, "reward": 0.13979627192020416, "reward_std": 0.024196792393922806, "rewards/bleu_reward_func/mean": 0.13979627192020416, "rewards/bleu_reward_func/std": 0.11370246112346649, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 191.96875, "completions/mean_terminated_length": 170.6333465576172, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9048, "grad_norm": 9.648924827575684, "kl": 0.3162841796875, "learning_rate": 1e-06, "loss": 0.0993, "num_tokens": 14715324.0, "reward": 0.24474243819713593, "reward_std": 0.03903892636299133, "rewards/bleu_reward_func/mean": 0.24474243819713593, "rewards/bleu_reward_func/std": 0.111121766269207, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 158.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.9056, "grad_norm": 3.5864415168762207, "kl": 0.05462646484375, "learning_rate": 1e-06, "loss": 0.0438, "num_tokens": 14728656.0, "reward": 0.08658318221569061, "reward_std": 0.03171471878886223, "rewards/bleu_reward_func/mean": 0.08658318221569061, "rewards/bleu_reward_func/std": 0.05243143439292908, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 196.90625, "completions/mean_terminated_length": 186.74192810058594, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9064, "grad_norm": 5.391827583312988, "kl": 0.149871826171875, "learning_rate": 1e-06, "loss": 0.209, "num_tokens": 14743445.0, "reward": 0.14488989114761353, "reward_std": 0.05979035794734955, "rewards/bleu_reward_func/mean": 0.14488989114761353, "rewards/bleu_reward_func/std": 0.11484233289957047, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 40.0, "completions/mean_length": 147.9375, "completions/mean_terminated_length": 26.58333396911621, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.9072, "grad_norm": 9.304245948791504, "kl": 0.4073486328125, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 14755539.0, "reward": 0.2897959053516388, "reward_std": 0.11407680809497833, "rewards/bleu_reward_func/mean": 0.2897959053516388, "rewards/bleu_reward_func/std": 0.3296494483947754, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 302.0625, "completions/mean_terminated_length": 219.9130401611328, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.908, "grad_norm": 4.041188716888428, "kl": 0.08953857421875, "learning_rate": 1e-06, "loss": 0.0784, "num_tokens": 14771221.0, "reward": 0.0867033302783966, "reward_std": 0.03230535611510277, "rewards/bleu_reward_func/mean": 0.0867033302783966, "rewards/bleu_reward_func/std": 0.05712318420410156, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 353.21875, "completions/mean_terminated_length": 244.57894897460938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9088, "grad_norm": 3.4682111740112305, "kl": 0.063140869140625, "learning_rate": 1e-06, "loss": -0.013, "num_tokens": 14786884.0, "reward": 0.2040112018585205, "reward_std": 0.03282826021313667, "rewards/bleu_reward_func/mean": 0.2040112018585205, "rewards/bleu_reward_func/std": 0.2342006117105484, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 323.46875, "completions/mean_terminated_length": 176.8333282470703, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9096, "grad_norm": 9.80130672454834, "kl": 0.296661376953125, "learning_rate": 1e-06, "loss": 0.0879, "num_tokens": 14807267.0, "reward": 0.15428093075752258, "reward_std": 0.047303371131420135, "rewards/bleu_reward_func/mean": 0.15428093075752258, "rewards/bleu_reward_func/std": 0.12975330650806427, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 213.3125, "completions/mean_terminated_length": 113.75, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9104, "grad_norm": 6.25737190246582, "kl": 0.125823974609375, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 14815933.0, "reward": 0.0721752792596817, "reward_std": 0.021741271018981934, "rewards/bleu_reward_func/mean": 0.0721752792596817, "rewards/bleu_reward_func/std": 0.05829243361949921, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 107.125, "completions/mean_terminated_length": 107.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.9112, "grad_norm": 25.477249145507812, "kl": 0.55877685546875, "learning_rate": 1e-06, "loss": -0.2259, "num_tokens": 14825321.0, "reward": 0.1802026480436325, "reward_std": 0.11938925087451935, "rewards/bleu_reward_func/mean": 0.1802026480436325, "rewards/bleu_reward_func/std": 0.1613418012857437, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 105.3125, "completions/mean_terminated_length": 105.3125, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.912, "grad_norm": 11.604246139526367, "kl": 0.2550048828125, "learning_rate": 1e-06, "loss": -0.0128, "num_tokens": 14831115.0, "reward": 0.19547826051712036, "reward_std": 0.07176055759191513, "rewards/bleu_reward_func/mean": 0.19547826051712036, "rewards/bleu_reward_func/std": 0.11230416595935822, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 255.59375, "completions/mean_terminated_length": 208.11111450195312, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.9128, "grad_norm": 6.936506271362305, "kl": 0.14453125, "learning_rate": 1e-06, "loss": -0.1892, "num_tokens": 14847286.0, "reward": 0.07467533648014069, "reward_std": 0.0442538745701313, "rewards/bleu_reward_func/mean": 0.07467533648014069, "rewards/bleu_reward_func/std": 0.0976758524775505, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 397.8125, "completions/mean_terminated_length": 309.0, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9136, "grad_norm": 3.533027410507202, "kl": 0.1064453125, "learning_rate": 1e-06, "loss": 0.2572, "num_tokens": 14862600.0, "reward": 0.07055975496768951, "reward_std": 0.024620652198791504, "rewards/bleu_reward_func/mean": 0.07055975496768951, "rewards/bleu_reward_func/std": 0.04198145866394043, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 374.34375, "completions/mean_terminated_length": 144.9166717529297, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.9144, "grad_norm": 3.106947422027588, "kl": 0.05267333984375, "learning_rate": 1e-06, "loss": 0.3129, "num_tokens": 14879211.0, "reward": 0.047079749405384064, "reward_std": 0.01572955772280693, "rewards/bleu_reward_func/mean": 0.047079749405384064, "rewards/bleu_reward_func/std": 0.03182151913642883, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 296.21875, "completions/mean_terminated_length": 289.258056640625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.9152, "grad_norm": 9.539961814880371, "kl": 0.26116943359375, "learning_rate": 1e-06, "loss": 0.3821, "num_tokens": 14892074.0, "reward": 0.04650323465466499, "reward_std": 0.016895011067390442, "rewards/bleu_reward_func/mean": 0.04650323465466499, "rewards/bleu_reward_func/std": 0.020110802724957466, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 50.105262756347656, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.916, "grad_norm": 10.258657455444336, "kl": 0.3250732421875, "learning_rate": 1e-06, "loss": -0.0619, "num_tokens": 14904698.0, "reward": 0.17951351404190063, "reward_std": 0.07376629114151001, "rewards/bleu_reward_func/mean": 0.17951351404190063, "rewards/bleu_reward_func/std": 0.17956046760082245, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 286.78125, "completions/mean_terminated_length": 151.65000915527344, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9168, "grad_norm": 6.667459964752197, "kl": 0.08001708984375, "learning_rate": 1e-06, "loss": 0.5727, "num_tokens": 14917451.0, "reward": 0.03467312082648277, "reward_std": 0.01676066778600216, "rewards/bleu_reward_func/mean": 0.03467312082648277, "rewards/bleu_reward_func/std": 0.02004723809659481, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 29.0, "completions/mean_length": 266.90625, "completions/mean_terminated_length": 21.8125, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9176, "grad_norm": 7.602176189422607, "kl": 0.4158935546875, "learning_rate": 1e-06, "loss": 0.038, "num_tokens": 14932352.0, "reward": 0.19471007585525513, "reward_std": 0.04646201431751251, "rewards/bleu_reward_func/mean": 0.19471007585525513, "rewards/bleu_reward_func/std": 0.1760382354259491, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 315.15625, "completions/mean_terminated_length": 162.05555725097656, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.9184, "grad_norm": 6.479866027832031, "kl": 0.109619140625, "learning_rate": 1e-06, "loss": 0.0595, "num_tokens": 14947741.0, "reward": 0.10786274820566177, "reward_std": 0.036432720720767975, "rewards/bleu_reward_func/mean": 0.10786274820566177, "rewards/bleu_reward_func/std": 0.0789819210767746, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 472.5, "completions/mean_terminated_length": 427.7333679199219, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.9192, "grad_norm": 2.4095027446746826, "kl": 0.0462646484375, "learning_rate": 1e-06, "loss": 0.0305, "num_tokens": 14966173.0, "reward": 0.1280764639377594, "reward_std": 0.03749135136604309, "rewards/bleu_reward_func/mean": 0.1280764639377594, "rewards/bleu_reward_func/std": 0.05864708498120308, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 255.90625, "completions/mean_terminated_length": 29.941177368164062, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "epoch": 0.92, "grad_norm": 10.67159652709961, "kl": 0.1556396484375, "learning_rate": 1e-06, "loss": 0.1007, "num_tokens": 14979570.0, "reward": 0.055816084146499634, "reward_std": 0.020302332937717438, "rewards/bleu_reward_func/mean": 0.055816084146499634, "rewards/bleu_reward_func/std": 0.03035581111907959, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 210.78125, "completions/mean_terminated_length": 167.75, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.9208, "grad_norm": 6.78237771987915, "kl": 0.17840576171875, "learning_rate": 1e-06, "loss": 0.09, "num_tokens": 14993923.0, "reward": 0.07397650182247162, "reward_std": 0.01999451220035553, "rewards/bleu_reward_func/mean": 0.07397650182247162, "rewards/bleu_reward_func/std": 0.0341508574783802, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 227.34375, "completions/mean_terminated_length": 115.95652770996094, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9216, "grad_norm": 9.095620155334473, "kl": 0.483001708984375, "learning_rate": 1e-06, "loss": 0.2875, "num_tokens": 15005734.0, "reward": 0.1028270274400711, "reward_std": 0.05264887586236, "rewards/bleu_reward_func/mean": 0.1028270274400711, "rewards/bleu_reward_func/std": 0.08930659294128418, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 436.0625, "completions/mean_terminated_length": 269.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.9224, "grad_norm": 2.607079029083252, "kl": 0.04913330078125, "learning_rate": 1e-06, "loss": 0.2182, "num_tokens": 15021632.0, "reward": 0.08679656684398651, "reward_std": 0.05561990663409233, "rewards/bleu_reward_func/mean": 0.08679656684398651, "rewards/bleu_reward_func/std": 0.09985605627298355, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 248.28125, "completions/mean_terminated_length": 128.40908813476562, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9232, "grad_norm": 6.871771812438965, "kl": 0.18536376953125, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 15035081.0, "reward": 0.19525909423828125, "reward_std": 0.04538525268435478, "rewards/bleu_reward_func/mean": 0.19525909423828125, "rewards/bleu_reward_func/std": 0.18253828585147858, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 285.65625, "completions/mean_terminated_length": 59.3125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.924, "grad_norm": 6.028687953948975, "kl": 0.0938720703125, "learning_rate": 1e-06, "loss": 0.2695, "num_tokens": 15049558.0, "reward": 0.09561645239591599, "reward_std": 0.0469173789024353, "rewards/bleu_reward_func/mean": 0.09561645239591599, "rewards/bleu_reward_func/std": 0.07949265837669373, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 161.1199951171875, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.9248, "grad_norm": 18.29808807373047, "kl": 0.55096435546875, "learning_rate": 1e-06, "loss": 0.0911, "num_tokens": 15060954.0, "reward": 0.1312306672334671, "reward_std": 0.05109435319900513, "rewards/bleu_reward_func/mean": 0.1312306672334671, "rewards/bleu_reward_func/std": 0.0968712568283081, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 183.40625, "completions/mean_terminated_length": 73.875, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 0.9256, "grad_norm": 8.553780555725098, "kl": 0.2044677734375, "learning_rate": 1e-06, "loss": 0.3837, "num_tokens": 15071151.0, "reward": 0.08946996927261353, "reward_std": 0.032011546194553375, "rewards/bleu_reward_func/mean": 0.08946996927261353, "rewards/bleu_reward_func/std": 0.08429201692342758, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 187.40625, "completions/mean_terminated_length": 153.8275909423828, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "epoch": 0.9264, "grad_norm": 7.326318740844727, "kl": 0.14752197265625, "learning_rate": 1e-06, "loss": 0.5061, "num_tokens": 15080252.0, "reward": 0.1023559644818306, "reward_std": 0.045405931770801544, "rewards/bleu_reward_func/mean": 0.1023559644818306, "rewards/bleu_reward_func/std": 0.07145705074071884, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 304.4375, "completions/mean_terminated_length": 246.3199920654297, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.9272, "grad_norm": 5.513801574707031, "kl": 0.1817626953125, "learning_rate": 1e-06, "loss": 0.1149, "num_tokens": 15095610.0, "reward": 0.11080615222454071, "reward_std": 0.043468981981277466, "rewards/bleu_reward_func/mean": 0.11080615222454071, "rewards/bleu_reward_func/std": 0.04713428020477295, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 205.40625, "completions/mean_terminated_length": 161.60714721679688, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.928, "grad_norm": 7.627071857452393, "kl": 0.318359375, "learning_rate": 1e-06, "loss": 0.1805, "num_tokens": 15104167.0, "reward": 0.0735570564866066, "reward_std": 0.02132660523056984, "rewards/bleu_reward_func/mean": 0.0735570564866066, "rewards/bleu_reward_func/std": 0.05421363562345505, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 156.17391967773438, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9288, "grad_norm": 6.11322021484375, "kl": 0.27099609375, "learning_rate": 1e-06, "loss": 0.1028, "num_tokens": 15118823.0, "reward": 0.1414988487958908, "reward_std": 0.03222941979765892, "rewards/bleu_reward_func/mean": 0.1414988487958908, "rewards/bleu_reward_func/std": 0.15351709723472595, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 417.8125, "completions/mean_terminated_length": 238.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9296, "grad_norm": 3.5942113399505615, "kl": 0.08154296875, "learning_rate": 1e-06, "loss": 0.1689, "num_tokens": 15139105.0, "reward": 0.12588296830654144, "reward_std": 0.04371759667992592, "rewards/bleu_reward_func/mean": 0.12588296830654144, "rewards/bleu_reward_func/std": 0.14081913232803345, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 273.3846130371094, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.9304, "grad_norm": 3.5201575756073, "kl": 0.145751953125, "learning_rate": 1e-06, "loss": -0.0396, "num_tokens": 15151437.0, "reward": 0.06649903953075409, "reward_std": 0.024907082319259644, "rewards/bleu_reward_func/mean": 0.06649903953075409, "rewards/bleu_reward_func/std": 0.04641694948077202, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 263.8125, "completions/mean_terminated_length": 166.69564819335938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9312, "grad_norm": 5.745899200439453, "kl": 0.124755859375, "learning_rate": 1e-06, "loss": 0.1729, "num_tokens": 15164791.0, "reward": 0.33342817425727844, "reward_std": 0.06225915253162384, "rewards/bleu_reward_func/mean": 0.33342817425727844, "rewards/bleu_reward_func/std": 0.3276880383491516, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 280.1875, "completions/mean_terminated_length": 272.70965576171875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.932, "grad_norm": 7.059730052947998, "kl": 0.154541015625, "learning_rate": 1e-06, "loss": 0.2383, "num_tokens": 15176629.0, "reward": 0.03160897642374039, "reward_std": 0.010618302971124649, "rewards/bleu_reward_func/mean": 0.03160897642374039, "rewards/bleu_reward_func/std": 0.016612010076642036, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 295.59375, "completions/mean_terminated_length": 281.16668701171875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.9328, "grad_norm": 5.280132293701172, "kl": 0.13616943359375, "learning_rate": 1e-06, "loss": 0.3116, "num_tokens": 15188168.0, "reward": 0.03221059590578079, "reward_std": 0.02213170751929283, "rewards/bleu_reward_func/mean": 0.03221059590578079, "rewards/bleu_reward_func/std": 0.03985392674803734, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 132.5, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.9336, "grad_norm": 6.5651021003723145, "kl": 0.142578125, "learning_rate": 1e-06, "loss": 0.0658, "num_tokens": 15201892.0, "reward": 0.1807694286108017, "reward_std": 0.1409318894147873, "rewards/bleu_reward_func/mean": 0.1807694286108017, "rewards/bleu_reward_func/std": 0.2797906994819641, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 190.34483337402344, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9344, "grad_norm": 7.170085430145264, "kl": 0.400634765625, "learning_rate": 1e-06, "loss": 0.0572, "num_tokens": 15212724.0, "reward": 0.1451285183429718, "reward_std": 0.0673985704779625, "rewards/bleu_reward_func/mean": 0.1451285183429718, "rewards/bleu_reward_func/std": 0.12026475369930267, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 313.75, "completions/mean_terminated_length": 247.6666717529297, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9352, "grad_norm": 5.603696823120117, "kl": 0.152587890625, "learning_rate": 1e-06, "loss": 0.0616, "num_tokens": 15228428.0, "reward": 0.253650963306427, "reward_std": 0.03022560104727745, "rewards/bleu_reward_func/mean": 0.253650963306427, "rewards/bleu_reward_func/std": 0.3357136845588684, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 154.60870361328125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.936, "grad_norm": 6.110992431640625, "kl": 0.1976318359375, "learning_rate": 1e-06, "loss": -0.0162, "num_tokens": 15242768.0, "reward": 0.0904015377163887, "reward_std": 0.025095967575907707, "rewards/bleu_reward_func/mean": 0.0904015377163887, "rewards/bleu_reward_func/std": 0.09678779542446136, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 241.625, "completions/mean_terminated_length": 213.65516662597656, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "epoch": 0.9368, "grad_norm": 7.32260799407959, "kl": 0.28955078125, "learning_rate": 1e-06, "loss": 0.2098, "num_tokens": 15256260.0, "reward": 0.09696318954229355, "reward_std": 0.04769141972064972, "rewards/bleu_reward_func/mean": 0.09696318954229355, "rewards/bleu_reward_func/std": 0.07404191046953201, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 189.96875, "completions/mean_terminated_length": 156.65516662597656, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9376, "grad_norm": 8.43685531616211, "kl": 0.293212890625, "learning_rate": 1e-06, "loss": -0.1046, "num_tokens": 15269899.0, "reward": 0.09695740044116974, "reward_std": 0.037594642490148544, "rewards/bleu_reward_func/mean": 0.09695740044116974, "rewards/bleu_reward_func/std": 0.05750608071684837, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 343.03125, "completions/mean_terminated_length": 125.78572082519531, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.9384, "grad_norm": 5.582010269165039, "kl": 0.410400390625, "learning_rate": 1e-06, "loss": -0.1385, "num_tokens": 15287108.0, "reward": 0.1451946198940277, "reward_std": 0.03915205970406532, "rewards/bleu_reward_func/mean": 0.1451946198940277, "rewards/bleu_reward_func/std": 0.17974776029586792, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 331.84375, "completions/mean_terminated_length": 261.34783935546875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9392, "grad_norm": 11.770851135253906, "kl": 0.3424072265625, "learning_rate": 1e-06, "loss": 0.1869, "num_tokens": 15304535.0, "reward": 0.1413375586271286, "reward_std": 0.06474150717258453, "rewards/bleu_reward_func/mean": 0.1413375586271286, "rewards/bleu_reward_func/std": 0.0782981589436531, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 299.46875, "completions/mean_terminated_length": 188.1428680419922, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.94, "grad_norm": 5.467903137207031, "kl": 0.198486328125, "learning_rate": 1e-06, "loss": -0.1137, "num_tokens": 15317078.0, "reward": 0.14337725937366486, "reward_std": 0.03686724230647087, "rewards/bleu_reward_func/mean": 0.14337725937366486, "rewards/bleu_reward_func/std": 0.16096609830856323, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 253.8125, "completions/mean_terminated_length": 181.51998901367188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9408, "grad_norm": 8.620965957641602, "kl": 0.379150390625, "learning_rate": 1e-06, "loss": -0.0426, "num_tokens": 15328768.0, "reward": 0.1426527500152588, "reward_std": 0.0550708994269371, "rewards/bleu_reward_func/mean": 0.1426527500152588, "rewards/bleu_reward_func/std": 0.12621666491031647, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 193.21875, "completions/mean_terminated_length": 86.95833587646484, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.9416, "grad_norm": 10.362127304077148, "kl": 0.565673828125, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 15340919.0, "reward": 0.04140050709247589, "reward_std": 0.019718483090400696, "rewards/bleu_reward_func/mean": 0.04140050709247589, "rewards/bleu_reward_func/std": 0.03685431182384491, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 349.5625, "completions/mean_terminated_length": 223.22222900390625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.9424, "grad_norm": 5.097819805145264, "kl": 0.2220458984375, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 15356545.0, "reward": 0.14949634671211243, "reward_std": 0.05013212561607361, "rewards/bleu_reward_func/mean": 0.14949634671211243, "rewards/bleu_reward_func/std": 0.19787877798080444, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 160.03125, "completions/mean_terminated_length": 123.62068939208984, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.9432, "grad_norm": 10.772759437561035, "kl": 0.300537109375, "learning_rate": 1e-06, "loss": 0.0346, "num_tokens": 15364554.0, "reward": 0.1290975958108902, "reward_std": 0.07744569331407547, "rewards/bleu_reward_func/mean": 0.1290975958108902, "rewards/bleu_reward_func/std": 0.1356077641248703, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 378.1875, "completions/mean_terminated_length": 122.72727966308594, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.944, "grad_norm": 6.254019260406494, "kl": 0.2237548828125, "learning_rate": 1e-06, "loss": 0.0559, "num_tokens": 15382792.0, "reward": 0.11257205903530121, "reward_std": 0.036544833332300186, "rewards/bleu_reward_func/mean": 0.11257205903530121, "rewards/bleu_reward_func/std": 0.10338166356086731, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 37.45454788208008, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9448, "grad_norm": 30.484220504760742, "kl": 1.228515625, "learning_rate": 1e-06, "loss": 0.7897, "num_tokens": 15391000.0, "reward": 0.2840408384799957, "reward_std": 0.15822984278202057, "rewards/bleu_reward_func/mean": 0.2840408384799957, "rewards/bleu_reward_func/std": 0.1896887719631195, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 237.8125, "completions/mean_terminated_length": 130.52174377441406, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9456, "grad_norm": 18.71758270263672, "kl": 1.376953125, "learning_rate": 1e-06, "loss": 0.403, "num_tokens": 15401594.0, "reward": 0.16319331526756287, "reward_std": 0.07705336064100266, "rewards/bleu_reward_func/mean": 0.16319331526756287, "rewards/bleu_reward_func/std": 0.21551194787025452, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 301.6875, "completions/mean_terminated_length": 175.5, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9464, "grad_norm": 10.93260383605957, "kl": 0.89013671875, "learning_rate": 1e-06, "loss": 0.073, "num_tokens": 15415696.0, "reward": 0.1893438994884491, "reward_std": 0.050289541482925415, "rewards/bleu_reward_func/mean": 0.1893438994884491, "rewards/bleu_reward_func/std": 0.2888805866241455, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 512.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 409.75, "completions/mean_terminated_length": 184.8000030517578, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.9472, "grad_norm": 18.76006317138672, "kl": 0.81097412109375, "learning_rate": 1e-06, "loss": 0.1983, "num_tokens": 15436760.0, "reward": 0.06023106724023819, "reward_std": 0.03375660628080368, "rewards/bleu_reward_func/mean": 0.06023106724023819, "rewards/bleu_reward_func/std": 0.07748028635978699, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 188.42105102539062, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.948, "grad_norm": 8.845364570617676, "kl": 1.5517578125, "learning_rate": 1e-06, "loss": 0.1722, "num_tokens": 15449196.0, "reward": 0.08687852323055267, "reward_std": 0.02910173125565052, "rewards/bleu_reward_func/mean": 0.08687852323055267, "rewards/bleu_reward_func/std": 0.06549690663814545, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 192.5625, "completions/mean_terminated_length": 86.08333587646484, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.9488, "grad_norm": 17.081815719604492, "kl": 0.970703125, "learning_rate": 1e-06, "loss": 0.1558, "num_tokens": 15458006.0, "reward": 0.10367533564567566, "reward_std": 0.04477589949965477, "rewards/bleu_reward_func/mean": 0.10367533564567566, "rewards/bleu_reward_func/std": 0.1153038814663887, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 512.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 115.78572082519531, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.9496, "grad_norm": 18.12542152404785, "kl": 1.48876953125, "learning_rate": 1e-06, "loss": 0.1714, "num_tokens": 15469048.0, "reward": 0.14421464502811432, "reward_std": 0.05387473851442337, "rewards/bleu_reward_func/mean": 0.14421464502811432, "rewards/bleu_reward_func/std": 0.11958708614110947, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 170.8125, "completions/mean_terminated_length": 92.0769271850586, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.9504, "grad_norm": 19.755910873413086, "kl": 1.64697265625, "learning_rate": 1e-06, "loss": 0.083, "num_tokens": 15478610.0, "reward": 0.11767937242984772, "reward_std": 0.038736552000045776, "rewards/bleu_reward_func/mean": 0.11767937242984772, "rewards/bleu_reward_func/std": 0.14759457111358643, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 303.34375, "completions/mean_terminated_length": 233.7916717529297, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.9512, "grad_norm": 7.415678024291992, "kl": 0.464111328125, "learning_rate": 1e-06, "loss": -0.0998, "num_tokens": 15493469.0, "reward": 0.13566911220550537, "reward_std": 0.03367416933178902, "rewards/bleu_reward_func/mean": 0.13566911220550537, "rewards/bleu_reward_func/std": 0.16353026032447815, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 371.59375, "completions/mean_terminated_length": 307.7727355957031, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.952, "grad_norm": 4.499618053436279, "kl": 0.21990966796875, "learning_rate": 1e-06, "loss": -0.0327, "num_tokens": 15510992.0, "reward": 0.05285275727510452, "reward_std": 0.026856746524572372, "rewards/bleu_reward_func/mean": 0.05285275727510452, "rewards/bleu_reward_func/std": 0.028620464727282524, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 155.8518524169922, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9528, "grad_norm": 11.282941818237305, "kl": 0.353515625, "learning_rate": 1e-06, "loss": -0.0409, "num_tokens": 15519336.0, "reward": 0.09386638551950455, "reward_std": 0.03402595967054367, "rewards/bleu_reward_func/mean": 0.09386638551950455, "rewards/bleu_reward_func/std": 0.08018817007541656, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 53.5625, "completions/mean_terminated_length": 53.5625, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9536, "grad_norm": 30.713850021362305, "kl": 0.921142578125, "learning_rate": 1e-06, "loss": 0.5322, "num_tokens": 15524298.0, "reward": 0.1515873372554779, "reward_std": 0.03387141600251198, "rewards/bleu_reward_func/mean": 0.1515873372554779, "rewards/bleu_reward_func/std": 0.14795146882534027, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 307.78125, "completions/mean_terminated_length": 148.94444274902344, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.9544, "grad_norm": 8.172345161437988, "kl": 0.3499755859375, "learning_rate": 1e-06, "loss": -0.1691, "num_tokens": 15539683.0, "reward": 0.06142358481884003, "reward_std": 0.017768073827028275, "rewards/bleu_reward_func/mean": 0.06142358481884003, "rewards/bleu_reward_func/std": 0.02769811637699604, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 59.1875, "completions/mean_terminated_length": 44.58064270019531, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9552, "grad_norm": 10.542518615722656, "kl": 0.6220703125, "learning_rate": 1e-06, "loss": -0.2357, "num_tokens": 15546073.0, "reward": 0.13228365778923035, "reward_std": 0.05075054615736008, "rewards/bleu_reward_func/mean": 0.13228365778923035, "rewards/bleu_reward_func/std": 0.14825788140296936, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 351.75, "completions/mean_terminated_length": 255.60000610351562, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.956, "grad_norm": 14.762670516967773, "kl": 0.54180908203125, "learning_rate": 1e-06, "loss": 0.1487, "num_tokens": 15561137.0, "reward": 0.10621648281812668, "reward_std": 0.04206620901823044, "rewards/bleu_reward_func/mean": 0.10621648281812668, "rewards/bleu_reward_func/std": 0.05661296099424362, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 363.1875, "completions/mean_terminated_length": 285.23809814453125, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.9568, "grad_norm": 2.930482864379883, "kl": 0.08624267578125, "learning_rate": 1e-06, "loss": -0.2447, "num_tokens": 15575311.0, "reward": 0.026056351140141487, "reward_std": 0.01072642207145691, "rewards/bleu_reward_func/mean": 0.026056351140141487, "rewards/bleu_reward_func/std": 0.014662106521427631, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 334.6875, "completions/mean_terminated_length": 228.3000030517578, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9576, "grad_norm": 4.765532493591309, "kl": 0.20489501953125, "learning_rate": 1e-06, "loss": -0.0713, "num_tokens": 15591997.0, "reward": 0.18965111672878265, "reward_std": 0.03347271308302879, "rewards/bleu_reward_func/mean": 0.18965111672878265, "rewards/bleu_reward_func/std": 0.24074162542819977, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 206.71875, "completions/mean_terminated_length": 186.36668395996094, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9584, "grad_norm": 9.595136642456055, "kl": 0.4879150390625, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 15601852.0, "reward": 0.07087633013725281, "reward_std": 0.024902882054448128, "rewards/bleu_reward_func/mean": 0.07087633013725281, "rewards/bleu_reward_func/std": 0.052057161927223206, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 286.59375, "completions/mean_terminated_length": 223.47999572753906, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.9592, "grad_norm": 5.445902347564697, "kl": 0.289306640625, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 15613743.0, "reward": 0.10761390626430511, "reward_std": 0.03891483694314957, "rewards/bleu_reward_func/mean": 0.10761390626430511, "rewards/bleu_reward_func/std": 0.10792107880115509, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 29.75, "completions/mean_terminated_length": 29.75, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.96, "grad_norm": 16.023887634277344, "kl": 1.11669921875, "learning_rate": 1e-06, "loss": 0.1468, "num_tokens": 15619671.0, "reward": 0.3471377491950989, "reward_std": 0.05158979445695877, "rewards/bleu_reward_func/mean": 0.3471377491950989, "rewards/bleu_reward_func/std": 0.13238590955734253, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 213.60000610351562, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 0.9608, "grad_norm": 5.491018295288086, "kl": 0.32122802734375, "learning_rate": 1e-06, "loss": -0.2311, "num_tokens": 15634047.0, "reward": 0.07880916446447372, "reward_std": 0.030750975012779236, "rewards/bleu_reward_func/mean": 0.07880916446447372, "rewards/bleu_reward_func/std": 0.06850366294384003, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 264.5625, "completions/mean_terminated_length": 248.06668090820312, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.9616, "grad_norm": 4.983512878417969, "kl": 0.1092529296875, "learning_rate": 1e-06, "loss": -0.1154, "num_tokens": 15647369.0, "reward": 0.09226585179567337, "reward_std": 0.04809027165174484, "rewards/bleu_reward_func/mean": 0.09226585179567337, "rewards/bleu_reward_func/std": 0.14570128917694092, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 160.71875, "completions/mean_terminated_length": 160.71875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.9624, "grad_norm": 4.987871170043945, "kl": 0.1322021484375, "learning_rate": 1e-06, "loss": 0.0973, "num_tokens": 15656384.0, "reward": 0.036719270050525665, "reward_std": 0.007080578710883856, "rewards/bleu_reward_func/mean": 0.036719270050525665, "rewards/bleu_reward_func/std": 0.018336299806833267, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 238.03125, "completions/mean_terminated_length": 146.70834350585938, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.9632, "grad_norm": 4.710721492767334, "kl": 0.24249267578125, "learning_rate": 1e-06, "loss": -0.0163, "num_tokens": 15666769.0, "reward": 0.11304133385419846, "reward_std": 0.03101547807455063, "rewards/bleu_reward_func/mean": 0.11304133385419846, "rewards/bleu_reward_func/std": 0.09199430793523788, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 129.78125, "completions/mean_terminated_length": 129.78125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.964, "grad_norm": 9.284921646118164, "kl": 0.5504150390625, "learning_rate": 1e-06, "loss": -0.0268, "num_tokens": 15674122.0, "reward": 0.07591907680034637, "reward_std": 0.03484845906496048, "rewards/bleu_reward_func/mean": 0.07591907680034637, "rewards/bleu_reward_func/std": 0.05640895664691925, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 280.15625, "completions/mean_terminated_length": 158.71429443359375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9648, "grad_norm": 5.940964698791504, "kl": 0.234375, "learning_rate": 1e-06, "loss": 0.2716, "num_tokens": 15690119.0, "reward": 0.19026660919189453, "reward_std": 0.06492812931537628, "rewards/bleu_reward_func/mean": 0.19026660919189453, "rewards/bleu_reward_func/std": 0.1680937260389328, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 190.96875, "completions/mean_terminated_length": 131.51852416992188, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9656, "grad_norm": 4.697986602783203, "kl": 0.17156982421875, "learning_rate": 1e-06, "loss": 0.1839, "num_tokens": 15700302.0, "reward": 0.29912805557250977, "reward_std": 0.05129002407193184, "rewards/bleu_reward_func/mean": 0.29912805557250977, "rewards/bleu_reward_func/std": 0.33928823471069336, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 403.34375, "completions/mean_terminated_length": 346.4285888671875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.9664, "grad_norm": 3.426858901977539, "kl": 0.05426025390625, "learning_rate": 1e-06, "loss": -0.041, "num_tokens": 15718081.0, "reward": 0.10875709354877472, "reward_std": 0.017351722344756126, "rewards/bleu_reward_func/mean": 0.10875709354877472, "rewards/bleu_reward_func/std": 0.1039399579167366, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 236.59375, "completions/mean_terminated_length": 111.40909576416016, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.9672, "grad_norm": 5.4925079345703125, "kl": 0.2030029296875, "learning_rate": 1e-06, "loss": -0.1031, "num_tokens": 15728028.0, "reward": 0.04499085620045662, "reward_std": 0.01842951774597168, "rewards/bleu_reward_func/mean": 0.04499085620045662, "rewards/bleu_reward_func/std": 0.030968643724918365, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 128.64515686035156, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.968, "grad_norm": 9.575289726257324, "kl": 0.20196533203125, "learning_rate": 1e-06, "loss": 0.1409, "num_tokens": 15735296.0, "reward": 0.07829822599887848, "reward_std": 0.03164747357368469, "rewards/bleu_reward_func/mean": 0.07829822599887848, "rewards/bleu_reward_func/std": 0.07768744975328445, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 338.1875, "completions/mean_terminated_length": 298.0769348144531, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.9688, "grad_norm": 2.519958019256592, "kl": 0.042205810546875, "learning_rate": 1e-06, "loss": 0.2122, "num_tokens": 15750286.0, "reward": 0.11583074182271957, "reward_std": 0.0300702303647995, "rewards/bleu_reward_func/mean": 0.11583074182271957, "rewards/bleu_reward_func/std": 0.0626337081193924, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 202.1875, "completions/mean_terminated_length": 98.91667175292969, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9696, "grad_norm": 7.297961235046387, "kl": 0.3997802734375, "learning_rate": 1e-06, "loss": 0.1691, "num_tokens": 15759468.0, "reward": 0.14121456444263458, "reward_std": 0.051856689155101776, "rewards/bleu_reward_func/mean": 0.14121456444263458, "rewards/bleu_reward_func/std": 0.1731884926557541, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 183.1875, "completions/mean_terminated_length": 33.727272033691406, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9704, "grad_norm": 6.750447750091553, "kl": 0.354522705078125, "learning_rate": 1e-06, "loss": 0.3063, "num_tokens": 15771298.0, "reward": 0.20002232491970062, "reward_std": 0.05593840777873993, "rewards/bleu_reward_func/mean": 0.20002232491970062, "rewards/bleu_reward_func/std": 0.1818784922361374, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 155.59375, "completions/mean_terminated_length": 155.59375, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.9712, "grad_norm": 8.60672378540039, "kl": 0.29888916015625, "learning_rate": 1e-06, "loss": -0.1436, "num_tokens": 15781949.0, "reward": 0.07949218153953552, "reward_std": 0.030030012130737305, "rewards/bleu_reward_func/mean": 0.07949218153953552, "rewards/bleu_reward_func/std": 0.05354390665888786, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 193.4545440673828, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.972, "grad_norm": 4.160828590393066, "kl": 0.09783935546875, "learning_rate": 1e-06, "loss": -0.0156, "num_tokens": 15795061.0, "reward": 0.1600840538740158, "reward_std": 0.05235850065946579, "rewards/bleu_reward_func/mean": 0.1600840538740158, "rewards/bleu_reward_func/std": 0.0939527377486229, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 148.8125, "completions/mean_terminated_length": 65.0, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9728, "grad_norm": 5.613617420196533, "kl": 0.1966552734375, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 15804343.0, "reward": 0.1325000822544098, "reward_std": 0.054465532302856445, "rewards/bleu_reward_func/mean": 0.1325000822544098, "rewards/bleu_reward_func/std": 0.15841807425022125, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 118.34375, "completions/mean_terminated_length": 118.34375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9736, "grad_norm": 7.5834503173828125, "kl": 0.2967529296875, "learning_rate": 1e-06, "loss": 0.0808, "num_tokens": 15815658.0, "reward": 0.243885338306427, "reward_std": 0.05274055525660515, "rewards/bleu_reward_func/mean": 0.243885338306427, "rewards/bleu_reward_func/std": 0.14211414754390717, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 367.8125, "completions/mean_terminated_length": 223.625, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9744, "grad_norm": 3.8433821201324463, "kl": 0.0970458984375, "learning_rate": 1e-06, "loss": -0.0201, "num_tokens": 15831204.0, "reward": 0.10006400942802429, "reward_std": 0.02605537325143814, "rewards/bleu_reward_func/mean": 0.10006400942802429, "rewards/bleu_reward_func/std": 0.1093517392873764, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 191.90625, "completions/mean_terminated_length": 132.62962341308594, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "epoch": 0.9752, "grad_norm": 25.382192611694336, "kl": 0.375518798828125, "learning_rate": 1e-06, "loss": 0.2354, "num_tokens": 15842769.0, "reward": 0.21496494114398956, "reward_std": 0.08334603905677795, "rewards/bleu_reward_func/mean": 0.21496494114398956, "rewards/bleu_reward_func/std": 0.3287891745567322, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 68.17391204833984, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.976, "grad_norm": 8.422406196594238, "kl": 0.158935546875, "learning_rate": 1e-06, "loss": 0.7192, "num_tokens": 15852937.0, "reward": 0.21234184503555298, "reward_std": 0.10839352756738663, "rewards/bleu_reward_func/mean": 0.21234184503555298, "rewards/bleu_reward_func/std": 0.17191235721111298, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 291.15625, "completions/mean_terminated_length": 250.25926208496094, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.9768, "grad_norm": 3.8467891216278076, "kl": 0.101806640625, "learning_rate": 1e-06, "loss": -0.0472, "num_tokens": 15865934.0, "reward": 0.13597853481769562, "reward_std": 0.034184906631708145, "rewards/bleu_reward_func/mean": 0.13597853481769562, "rewards/bleu_reward_func/std": 0.0799630656838417, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 268.15625, "completions/mean_terminated_length": 223.0, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.9776, "grad_norm": 5.5957794189453125, "kl": 0.1470947265625, "learning_rate": 1e-06, "loss": 0.137, "num_tokens": 15881691.0, "reward": 0.11758720874786377, "reward_std": 0.05352931469678879, "rewards/bleu_reward_func/mean": 0.11758720874786377, "rewards/bleu_reward_func/std": 0.08839549124240875, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 364.28125, "completions/mean_terminated_length": 174.35714721679688, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.9784, "grad_norm": 3.5430212020874023, "kl": 0.0980224609375, "learning_rate": 1e-06, "loss": -0.0829, "num_tokens": 15901716.0, "reward": 0.10303943604230881, "reward_std": 0.02435348369181156, "rewards/bleu_reward_func/mean": 0.10303943604230881, "rewards/bleu_reward_func/std": 0.10681937634944916, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 350.09375, "completions/mean_terminated_length": 207.23529052734375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.9792, "grad_norm": 3.817512035369873, "kl": 0.06549072265625, "learning_rate": 1e-06, "loss": -0.2731, "num_tokens": 15916239.0, "reward": 0.025896022096276283, "reward_std": 0.011423053219914436, "rewards/bleu_reward_func/mean": 0.025896022096276283, "rewards/bleu_reward_func/std": 0.01915143057703972, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 512.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 148.59375, "completions/mean_terminated_length": 27.45833396911621, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.98, "grad_norm": 8.734639167785645, "kl": 0.29150390625, "learning_rate": 1e-06, "loss": 0.1203, "num_tokens": 15926002.0, "reward": 0.2994440793991089, "reward_std": 0.08188341557979584, "rewards/bleu_reward_func/mean": 0.2994440793991089, "rewards/bleu_reward_func/std": 0.17080959677696228, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 109.10344696044922, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.9808, "grad_norm": 6.3398637771606445, "kl": 0.203369140625, "learning_rate": 1e-06, "loss": 0.0948, "num_tokens": 15934822.0, "reward": 0.08409433811903, "reward_std": 0.022457323968410492, "rewards/bleu_reward_func/mean": 0.08409433811903, "rewards/bleu_reward_func/std": 0.12822076678276062, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 127.15625, "completions/mean_terminated_length": 114.74193572998047, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9816, "grad_norm": 5.545539379119873, "kl": 0.157470703125, "learning_rate": 1e-06, "loss": -0.038, "num_tokens": 15941411.0, "reward": 0.291486918926239, "reward_std": 0.01802459917962551, "rewards/bleu_reward_func/mean": 0.291486918926239, "rewards/bleu_reward_func/std": 0.30036208033561707, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 512.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 188.1875, "completions/mean_terminated_length": 128.22222900390625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9824, "grad_norm": 4.385817050933838, "kl": 0.12310791015625, "learning_rate": 1e-06, "loss": -0.2018, "num_tokens": 15948961.0, "reward": 0.050382573157548904, "reward_std": 0.019635431468486786, "rewards/bleu_reward_func/mean": 0.050382573157548904, "rewards/bleu_reward_func/std": 0.0410877950489521, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 167.40625, "completions/mean_terminated_length": 131.7586212158203, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.9832, "grad_norm": 6.975030422210693, "kl": 0.20166015625, "learning_rate": 1e-06, "loss": 0.0432, "num_tokens": 15959414.0, "reward": 0.10200367867946625, "reward_std": 0.012780029326677322, "rewards/bleu_reward_func/mean": 0.10200367867946625, "rewards/bleu_reward_func/std": 0.07971282303333282, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 149.60000610351562, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.984, "grad_norm": 58.40047836303711, "kl": 0.6114501953125, "learning_rate": 1e-06, "loss": 0.1056, "num_tokens": 15970374.0, "reward": 0.21096709370613098, "reward_std": 0.05436326563358307, "rewards/bleu_reward_func/mean": 0.21096709370613098, "rewards/bleu_reward_func/std": 0.21129880845546722, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 512.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 53.75, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9848, "grad_norm": 7.031259536743164, "kl": 0.173828125, "learning_rate": 1e-06, "loss": -0.0097, "num_tokens": 15985058.0, "reward": 0.13087573647499084, "reward_std": 0.03848683089017868, "rewards/bleu_reward_func/mean": 0.13087573647499084, "rewards/bleu_reward_func/std": 0.07179337739944458, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 347.21875, "completions/mean_terminated_length": 282.7391357421875, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9856, "grad_norm": 13.313501358032227, "kl": 0.432952880859375, "learning_rate": 1e-06, "loss": 0.1537, "num_tokens": 16003825.0, "reward": 0.06137411668896675, "reward_std": 0.036481164395809174, "rewards/bleu_reward_func/mean": 0.06137411668896675, "rewards/bleu_reward_func/std": 0.05317319929599762, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 114.03125, "completions/mean_terminated_length": 87.50000762939453, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9864, "grad_norm": 12.291428565979004, "kl": 0.4300537109375, "learning_rate": 1e-06, "loss": 0.2836, "num_tokens": 16012714.0, "reward": 0.05898230895400047, "reward_std": 0.024662408977746964, "rewards/bleu_reward_func/mean": 0.05898230895400047, "rewards/bleu_reward_func/std": 0.05822930857539177, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 512.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 255.46875, "completions/mean_terminated_length": 155.0869598388672, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.9872, "grad_norm": 6.375350475311279, "kl": 0.20550537109375, "learning_rate": 1e-06, "loss": -0.0603, "num_tokens": 16023033.0, "reward": 0.12795159220695496, "reward_std": 0.034560851752758026, "rewards/bleu_reward_func/mean": 0.12795159220695496, "rewards/bleu_reward_func/std": 0.05310589075088501, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 198.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.988, "grad_norm": 2.701765775680542, "kl": 0.1783447265625, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 16037525.0, "reward": 0.08859970420598984, "reward_std": 0.012333719059824944, "rewards/bleu_reward_func/mean": 0.08859970420598984, "rewards/bleu_reward_func/std": 0.05836745351552963, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 368.5625, "completions/mean_terminated_length": 282.5, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 0.9888, "grad_norm": 3.917327642440796, "kl": 0.1302490234375, "learning_rate": 1e-06, "loss": -0.3326, "num_tokens": 16051695.0, "reward": 0.055808089673519135, "reward_std": 0.020165979862213135, "rewards/bleu_reward_func/mean": 0.055808089673519135, "rewards/bleu_reward_func/std": 0.05799167603254318, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 258.4375, "completions/mean_terminated_length": 187.44000244140625, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.9896, "grad_norm": 11.650849342346191, "kl": 0.3966064453125, "learning_rate": 1e-06, "loss": -0.1616, "num_tokens": 16063021.0, "reward": 0.09570951759815216, "reward_std": 0.041778795421123505, "rewards/bleu_reward_func/mean": 0.09570951759815216, "rewards/bleu_reward_func/std": 0.09836214780807495, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 130.71875, "completions/mean_terminated_length": 130.71875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.9904, "grad_norm": 6.946449279785156, "kl": 0.3614501953125, "learning_rate": 1e-06, "loss": 0.0432, "num_tokens": 16073188.0, "reward": 0.19768103957176208, "reward_std": 0.05326389521360397, "rewards/bleu_reward_func/mean": 0.19768103957176208, "rewards/bleu_reward_func/std": 0.14096693694591522, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 153.59375, "completions/mean_terminated_length": 53.23999786376953, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.9912, "grad_norm": 15.68226432800293, "kl": 0.60394287109375, "learning_rate": 1e-06, "loss": 0.1991, "num_tokens": 16089367.0, "reward": 0.19772392511367798, "reward_std": 0.04295985400676727, "rewards/bleu_reward_func/mean": 0.19772392511367798, "rewards/bleu_reward_func/std": 0.15457068383693695, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 202.1875, "completions/mean_terminated_length": 181.53334045410156, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.992, "grad_norm": 5.087695121765137, "kl": 0.21142578125, "learning_rate": 1e-06, "loss": -0.1565, "num_tokens": 16097533.0, "reward": 0.04568080976605415, "reward_std": 0.0273725725710392, "rewards/bleu_reward_func/mean": 0.04568080976605415, "rewards/bleu_reward_func/std": 0.05127081274986267, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 512.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 98.1875, "completions/mean_terminated_length": 55.379310607910156, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.9928, "grad_norm": 9.800594329833984, "kl": 0.535400390625, "learning_rate": 1e-06, "loss": -0.2267, "num_tokens": 16104579.0, "reward": 0.1168278306722641, "reward_std": 0.04581147059798241, "rewards/bleu_reward_func/mean": 0.1168278306722641, "rewards/bleu_reward_func/std": 0.08855386078357697, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 512.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 48.38888931274414, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "epoch": 0.9936, "grad_norm": 6.915892124176025, "kl": 0.300262451171875, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 16117106.0, "reward": 0.21942071616649628, "reward_std": 0.06735092401504517, "rewards/bleu_reward_func/mean": 0.21942071616649628, "rewards/bleu_reward_func/std": 0.1295205056667328, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 512.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 386.9375, "completions/mean_terminated_length": 311.8999938964844, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 0.9944, "grad_norm": 11.362247467041016, "kl": 0.95458984375, "learning_rate": 1e-06, "loss": -0.2592, "num_tokens": 16131824.0, "reward": 0.03232087939977646, "reward_std": 0.018025288358330727, "rewards/bleu_reward_func/mean": 0.03232087939977646, "rewards/bleu_reward_func/std": 0.026756620034575462, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 229.40625, "completions/mean_terminated_length": 220.29031372070312, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "epoch": 0.9952, "grad_norm": 15.675792694091797, "kl": 0.33203125, "learning_rate": 1e-06, "loss": 0.0615, "num_tokens": 16145325.0, "reward": 0.08530285954475403, "reward_std": 0.03364046663045883, "rewards/bleu_reward_func/mean": 0.08530285954475403, "rewards/bleu_reward_func/std": 0.06811228394508362, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 426.5625, "completions/mean_terminated_length": 381.8095397949219, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.996, "grad_norm": 2.0989584922790527, "kl": 0.0555419921875, "learning_rate": 1e-06, "loss": -0.0865, "num_tokens": 16164487.0, "reward": 0.10918224602937698, "reward_std": 0.043439704924821854, "rewards/bleu_reward_func/mean": 0.10918224602937698, "rewards/bleu_reward_func/std": 0.09625791013240814, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 127.90625, "completions/mean_terminated_length": 115.51612854003906, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.9968, "grad_norm": 8.359559059143066, "kl": 0.6103515625, "learning_rate": 1e-06, "loss": 0.2382, "num_tokens": 16172076.0, "reward": 0.0722852572798729, "reward_std": 0.0363241545855999, "rewards/bleu_reward_func/mean": 0.0722852572798729, "rewards/bleu_reward_func/std": 0.05653948336839676, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 63.21875, "completions/mean_terminated_length": 63.21875, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9976, "grad_norm": 14.668207168579102, "kl": 0.6405029296875, "learning_rate": 1e-06, "loss": 0.1996, "num_tokens": 16178059.0, "reward": 0.11710416525602341, "reward_std": 0.044295113533735275, "rewards/bleu_reward_func/mean": 0.11710416525602341, "rewards/bleu_reward_func/std": 0.06186880171298981, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 174.32257080078125, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.9984, "grad_norm": 9.499897956848145, "kl": 0.4058837890625, "learning_rate": 1e-06, "loss": -0.1047, "num_tokens": 16190527.0, "reward": 0.14165818691253662, "reward_std": 0.042991265654563904, "rewards/bleu_reward_func/mean": 0.14165818691253662, "rewards/bleu_reward_func/std": 0.1511020064353943, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 168.92308044433594, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.9992, "grad_norm": 6.786505222320557, "kl": 0.34844970703125, "learning_rate": 1e-06, "loss": -0.0929, "num_tokens": 16203199.0, "reward": 0.14652788639068604, "reward_std": 0.05133647471666336, "rewards/bleu_reward_func/mean": 0.14652788639068604, "rewards/bleu_reward_func/std": 0.18619418144226074, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 39.75, "completions/mean_terminated_length": 39.75, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "epoch": 1.0, "grad_norm": 9.613625526428223, "kl": 0.37677001953125, "learning_rate": 1e-06, "loss": 0.061, "num_tokens": 16214113.0, "reward": 0.10052811354398727, "reward_std": 0.05825551599264145, "rewards/bleu_reward_func/mean": 0.10052811354398727, "rewards/bleu_reward_func/std": 0.10802065581083298, "step": 1250 } ], "logging_steps": 1, "max_steps": 1250, "num_input_tokens_seen": 16214113, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }