{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17142857142857143, "eval_steps": 500, "global_step": 150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 1702.03125, "completions/mean_terminated_length": 993.6190795898438, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.001142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2544386684894562, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 118418.0, "reward": 0.17899775505065918, "reward_std": 0.7650213241577148, "rewards/cosine_scaled_reward/mean": -0.09800112992525101, "rewards/cosine_scaled_reward/std": 0.37953105568885803, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.48795005679130554, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1738.90625, "completions/mean_terminated_length": 949.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.002285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2436082512140274, "learning_rate": 5e-08, "loss": -0.0, "num_tokens": 239748.0, "reward": 0.3848632574081421, "reward_std": 0.9111153483390808, "rewards/cosine_scaled_reward/mean": 0.020556632429361343, "rewards/cosine_scaled_reward/std": 0.4492928683757782, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1989.015625, "completions/mean_terminated_length": 1104.25, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.0034285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2544717788696289, "learning_rate": 1e-07, "loss": -0.0, "num_tokens": 377517.0, "reward": -0.3279358148574829, "reward_std": 0.33216947317123413, "rewards/cosine_scaled_reward/mean": -0.20303040742874146, "rewards/cosine_scaled_reward/std": 0.179075226187706, "rewards/format_reward/mean": 0.078125, "rewards/format_reward/std": 0.27048972249031067, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1566.421875, "completions/mean_terminated_length": 1084.84375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.004571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.28807103633880615, "learning_rate": 1.5e-07, "loss": -0.0, "num_tokens": 487576.0, "reward": 0.2716121971607208, "reward_std": 0.6643469333648682, "rewards/cosine_scaled_reward/mean": -0.12981891632080078, "rewards/cosine_scaled_reward/std": 0.3019586503505707, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5029674172401428, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 1936.84375, "completions/mean_terminated_length": 1031.71435546875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.26783761382102966, "learning_rate": 2e-07, "loss": -0.0, "num_tokens": 622350.0, "reward": -0.3612896800041199, "reward_std": 0.41048353910446167, "rewards/cosine_scaled_reward/mean": -0.23533234000205994, "rewards/cosine_scaled_reward/std": 0.20467400550842285, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.3145764470100403, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 1889.453125, "completions/mean_terminated_length": 779.625, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.006857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.262518972158432, "learning_rate": 2.5e-07, "loss": 0.0, "num_tokens": 754923.0, "reward": -0.29250282049179077, "reward_std": 0.5422531962394714, "rewards/cosine_scaled_reward/mean": -0.22437641024589539, "rewards/cosine_scaled_reward/std": 0.22509199380874634, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.36596253514289856, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 1921.921875, "completions/mean_terminated_length": 1314.45458984375, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.22601397335529327, "learning_rate": 3e-07, "loss": 0.0, "num_tokens": 888334.0, "reward": 0.025340259075164795, "reward_std": 0.7285393476486206, "rewards/cosine_scaled_reward/mean": -0.1279548704624176, "rewards/cosine_scaled_reward/std": 0.40222346782684326, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4531635046005249, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1736.859375, "completions/mean_terminated_length": 999.9473876953125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.009142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.24552854895591736, "learning_rate": 3.5e-07, "loss": 0.0, "num_tokens": 1009909.0, "reward": 0.21729671955108643, "reward_std": 0.6989120244979858, "rewards/cosine_scaled_reward/mean": -0.055414143949747086, "rewards/cosine_scaled_reward/std": 0.47493892908096313, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.4732423722743988, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1967.53125, "completions/mean_terminated_length": 1475.77783203125, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 0.010285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.2430322915315628, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 1147287.0, "reward": -0.21451422572135925, "reward_std": 0.587526798248291, "rewards/cosine_scaled_reward/mean": -0.19319462776184082, "rewards/cosine_scaled_reward/std": 0.29357606172561646, "rewards/format_reward/mean": 0.171875, "rewards/format_reward/std": 0.38025420904159546, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1708.546875, "completions/mean_terminated_length": 961.75, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2543582320213318, "learning_rate": 4.5e-07, "loss": 0.0, "num_tokens": 1267466.0, "reward": 0.02539752423763275, "reward_std": 0.545810341835022, "rewards/cosine_scaled_reward/mean": -0.14355123043060303, "rewards/cosine_scaled_reward/std": 0.36147356033325195, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.467176616191864, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 1967.734375, "completions/mean_terminated_length": 1191.8333740234375, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.012571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.24583907425403595, "learning_rate": 5e-07, "loss": -0.0, "num_tokens": 1405073.0, "reward": -0.46971434354782104, "reward_std": 0.36104393005371094, "rewards/cosine_scaled_reward/mean": -0.28173214197158813, "rewards/cosine_scaled_reward/std": 0.17775526642799377, "rewards/format_reward/mean": 0.09375, "rewards/format_reward/std": 0.29378482699394226, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 1707.5625, "completions/mean_terminated_length": 1176.47998046875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.013714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.3135142922401428, "learning_rate": 5.5e-07, "loss": -0.0, "num_tokens": 1525301.0, "reward": 0.0018395520746707916, "reward_std": 0.7012988328933716, "rewards/cosine_scaled_reward/mean": -0.21783021092414856, "rewards/cosine_scaled_reward/std": 0.324150949716568, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 1841.96875, "completions/mean_terminated_length": 1168.933349609375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.014857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2532394826412201, "learning_rate": 6e-07, "loss": -0.0, "num_tokens": 1654227.0, "reward": -0.10322706401348114, "reward_std": 0.6915165185928345, "rewards/cosine_scaled_reward/mean": -0.17661353945732117, "rewards/cosine_scaled_reward/std": 0.329875111579895, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1816.390625, "completions/mean_terminated_length": 1306.8499755859375, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.28405147790908813, "learning_rate": 6.5e-07, "loss": 0.0, "num_tokens": 1781084.0, "reward": 0.10602855682373047, "reward_std": 0.630502462387085, "rewards/cosine_scaled_reward/mean": -0.11104822158813477, "rewards/cosine_scaled_reward/std": 0.3846627473831177, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.4732423722743988, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 1702.109375, "completions/mean_terminated_length": 818.1666870117188, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.28779250383377075, "learning_rate": 7e-07, "loss": 0.0, "num_tokens": 1900939.0, "reward": 0.32734519243240356, "reward_std": 0.3870265483856201, "rewards/cosine_scaled_reward/mean": 0.007422588765621185, "rewards/cosine_scaled_reward/std": 0.45787373185157776, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.467176616191864, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.018285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.2337152510881424, "learning_rate": 7.5e-07, "loss": -0.0, "num_tokens": 2042451.0, "reward": -0.5429925918579102, "reward_std": 0.3153150975704193, "rewards/cosine_scaled_reward/mean": -0.2714962661266327, "rewards/cosine_scaled_reward/std": 0.1678173691034317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 1564.921875, "completions/mean_terminated_length": 858.8846435546875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.019428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.33599403500556946, "learning_rate": 8e-07, "loss": -0.0, "num_tokens": 2153126.0, "reward": 0.17696775496006012, "reward_std": 0.6489306688308716, "rewards/cosine_scaled_reward/mean": -0.11464111506938934, "rewards/cosine_scaled_reward/std": 0.3551919758319855, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.49501484632492065, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 1795.390625, "completions/mean_terminated_length": 893.21435546875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.02057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.22697053849697113, "learning_rate": 8.499999999999999e-07, "loss": -0.0, "num_tokens": 2278407.0, "reward": -0.10711958259344101, "reward_std": 0.5238703489303589, "rewards/cosine_scaled_reward/mean": -0.1785597801208496, "rewards/cosine_scaled_reward/std": 0.2545098662376404, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 1921.484375, "completions/mean_terminated_length": 1238.300048828125, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.021714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23972108960151672, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 2412638.0, "reward": 0.029344379901885986, "reward_std": 0.6719281077384949, "rewards/cosine_scaled_reward/mean": -0.086890310049057, "rewards/cosine_scaled_reward/std": 0.40220555663108826, "rewards/format_reward/mean": 0.203125, "rewards/format_reward/std": 0.40550529956817627, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1728.5625, "completions/mean_terminated_length": 845.4117431640625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.23309311270713806, "learning_rate": 9.499999999999999e-07, "loss": 0.0, "num_tokens": 2534618.0, "reward": 0.0131673663854599, "reward_std": 0.4436222314834595, "rewards/cosine_scaled_reward/mean": -0.13404130935668945, "rewards/cosine_scaled_reward/std": 0.32819250226020813, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4531635046005249, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 1777.953125, "completions/mean_terminated_length": 1087.8333740234375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.29990270733833313, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 2659215.0, "reward": -0.1764472872018814, "reward_std": 0.5121938586235046, "rewards/cosine_scaled_reward/mean": -0.2444736361503601, "rewards/cosine_scaled_reward/std": 0.289971262216568, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.467176616191864, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1361.28125, "completions/mean_terminated_length": 921.0769653320312, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.025142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.29922786355018616, "learning_rate": 9.99931462820376e-07, "loss": -0.0, "num_tokens": 2755353.0, "reward": 0.6089149713516235, "reward_std": 0.5986809730529785, "rewards/cosine_scaled_reward/mean": -0.05491749942302704, "rewards/cosine_scaled_reward/std": 0.39076483249664307, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1565.046875, "completions/mean_terminated_length": 903.2222290039062, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.026285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.27512773871421814, "learning_rate": 9.997258721585931e-07, "loss": -0.0, "num_tokens": 2866308.0, "reward": 0.21871733665466309, "reward_std": 0.5976030826568604, "rewards/cosine_scaled_reward/mean": -0.10157884657382965, "rewards/cosine_scaled_reward/std": 0.3856185972690582, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.49776285886764526, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1801.671875, "completions/mean_terminated_length": 1259.75, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.027428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.22642865777015686, "learning_rate": 9.993832906395582e-07, "loss": -0.0, "num_tokens": 2992543.0, "reward": 0.04899948835372925, "reward_std": 0.8525694608688354, "rewards/cosine_scaled_reward/mean": -0.17081275582313538, "rewards/cosine_scaled_reward/std": 0.3993513882160187, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.4917473793029785, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1715.765625, "completions/mean_terminated_length": 1035.4761962890625, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.25316134095191956, "learning_rate": 9.989038226169207e-07, "loss": -0.0, "num_tokens": 3112648.0, "reward": 0.10585837811231613, "reward_std": 0.7828943729400635, "rewards/cosine_scaled_reward/mean": -0.11894579976797104, "rewards/cosine_scaled_reward/std": 0.4141720235347748, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1917.703125, "completions/mean_terminated_length": 1452.357177734375, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 0.029714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2521306574344635, "learning_rate": 9.982876141412855e-07, "loss": -0.0, "num_tokens": 3246013.0, "reward": 0.17620250582695007, "reward_std": 0.6548349857330322, "rewards/cosine_scaled_reward/mean": -0.08377375453710556, "rewards/cosine_scaled_reward/std": 0.3527655303478241, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1851.015625, "completions/mean_terminated_length": 1147.5, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.030857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2730060815811157, "learning_rate": 9.975348529157229e-07, "loss": -0.0, "num_tokens": 3374766.0, "reward": -0.18854813277721405, "reward_std": 0.49348777532577515, "rewards/cosine_scaled_reward/mean": -0.21146157383918762, "rewards/cosine_scaled_reward/std": 0.2601618766784668, "rewards/format_reward/mean": 0.234375, "rewards/format_reward/std": 0.42695629596710205, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1798.328125, "completions/mean_terminated_length": 1049.3125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.2566036880016327, "learning_rate": 9.96645768238595e-07, "loss": 0.0, "num_tokens": 3500195.0, "reward": 0.06705980002880096, "reward_std": 0.7090284824371338, "rewards/cosine_scaled_reward/mean": -0.10709509253501892, "rewards/cosine_scaled_reward/std": 0.4101051986217499, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4531635046005249, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1930.203125, "completions/mean_terminated_length": 1210.3333740234375, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.03314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25197461247444153, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "num_tokens": 3634200.0, "reward": -0.2462695688009262, "reward_std": 0.5237302780151367, "rewards/cosine_scaled_reward/mean": -0.2012597918510437, "rewards/cosine_scaled_reward/std": 0.23252712190151215, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.36596253514289856, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 1847.65625, "completions/mean_terminated_length": 1061.6923828125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.30431485176086426, "learning_rate": 9.944597532678119e-07, "loss": 0.0, "num_tokens": 3762986.0, "reward": -0.05392302945256233, "reward_std": 0.7249555587768555, "rewards/cosine_scaled_reward/mean": -0.15196150541305542, "rewards/cosine_scaled_reward/std": 0.34566983580589294, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 1838.671875, "completions/mean_terminated_length": 931.5833740234375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.03542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2484513372182846, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "num_tokens": 3891157.0, "reward": -0.11271396279335022, "reward_std": 0.6705260872840881, "rewards/cosine_scaled_reward/mean": -0.1813569962978363, "rewards/cosine_scaled_reward/std": 0.4071698486804962, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 1910.109375, "completions/mean_terminated_length": 1417.6429443359375, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.036571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.25329527258872986, "learning_rate": 9.917322325514487e-07, "loss": -0.0, "num_tokens": 4023756.0, "reward": -0.08931556344032288, "reward_std": 0.6381070613861084, "rewards/cosine_scaled_reward/mean": -0.16965776681900024, "rewards/cosine_scaled_reward/std": 0.37385129928588867, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 2023.71875, "completions/mean_terminated_length": 1530.0, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "epoch": 0.037714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22758109867572784, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "num_tokens": 4164490.0, "reward": -0.4589868187904358, "reward_std": 0.5177067518234253, "rewards/cosine_scaled_reward/mean": -0.2919934093952179, "rewards/cosine_scaled_reward/std": 0.2252870500087738, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3333333432674408, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1454.78125, "completions/mean_terminated_length": 963.2571411132812, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.038857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.3234354257583618, "learning_rate": 9.88466529153356e-07, "loss": 0.0, "num_tokens": 4267148.0, "reward": 0.656031608581543, "reward_std": 0.7529654502868652, "rewards/cosine_scaled_reward/mean": 0.05457830801606178, "rewards/cosine_scaled_reward/std": 0.49684229493141174, "rewards/format_reward/mean": 0.546875, "rewards/format_reward/std": 0.501733124256134, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 1819.078125, "completions/mean_terminated_length": 716.0909423828125, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.2821458876132965, "learning_rate": 9.866330768241983e-07, "loss": -0.0, "num_tokens": 4395065.0, "reward": -0.09630556404590607, "reward_std": 0.7089139223098755, "rewards/cosine_scaled_reward/mean": -0.15752778947353363, "rewards/cosine_scaled_reward/std": 0.3647947609424591, "rewards/format_reward/mean": 0.21875, "rewards/format_reward/std": 0.4166666865348816, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 1954.34375, "completions/mean_terminated_length": 1382.0, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.04114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.24163897335529327, "learning_rate": 9.846666218300807e-07, "loss": -0.0, "num_tokens": 4531255.0, "reward": -0.34593287110328674, "reward_std": 0.44493502378463745, "rewards/cosine_scaled_reward/mean": -0.24327893555164337, "rewards/cosine_scaled_reward/std": 0.24784433841705322, "rewards/format_reward/mean": 0.140625, "rewards/format_reward/std": 0.3503824472427368, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 1868.921875, "completions/mean_terminated_length": 1092.916748046875, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.04228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.24795544147491455, "learning_rate": 9.825677631722435e-07, "loss": -0.0, "num_tokens": 4661890.0, "reward": -0.23053905367851257, "reward_std": 0.34036368131637573, "rewards/cosine_scaled_reward/mean": -0.2246445268392563, "rewards/cosine_scaled_reward/std": 0.15942412614822388, "rewards/format_reward/mean": 0.21875, "rewards/format_reward/std": 0.4166666865348816, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 1889.53125, "completions/mean_terminated_length": 1033.800048828125, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.04342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24283826351165771, "learning_rate": 9.80337140183366e-07, "loss": 0.0, "num_tokens": 4794532.0, "reward": -0.10043507814407349, "reward_std": 0.47925832867622375, "rewards/cosine_scaled_reward/mean": -0.13615503907203674, "rewards/cosine_scaled_reward/std": 0.3336707651615143, "rewards/format_reward/mean": 0.171875, "rewards/format_reward/std": 0.38025420904159546, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 1644.828125, "completions/mean_terminated_length": 689.9473876953125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.044571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.28362998366355896, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "num_tokens": 4910585.0, "reward": 0.12284853309392929, "reward_std": 0.4183085858821869, "rewards/cosine_scaled_reward/mean": -0.11045074462890625, "rewards/cosine_scaled_reward/std": 0.30217844247817993, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 1618.28125, "completions/mean_terminated_length": 902.0833740234375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.262617826461792, "learning_rate": 9.754833590196926e-07, "loss": 0.0, "num_tokens": 5024227.0, "reward": 0.2076582908630371, "reward_std": 0.42125773429870605, "rewards/cosine_scaled_reward/mean": -0.12273336946964264, "rewards/cosine_scaled_reward/std": 0.4404613971710205, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.501733124256134, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 1717.734375, "completions/mean_terminated_length": 1235.0384521484375, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.046857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.23294499516487122, "learning_rate": 9.728616793536587e-07, "loss": -0.0, "num_tokens": 5145314.0, "reward": 0.011502981185913086, "reward_std": 0.6816084980964661, "rewards/cosine_scaled_reward/mean": -0.22081100940704346, "rewards/cosine_scaled_reward/std": 0.37589573860168457, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.501733124256134, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 1703.921875, "completions/mean_terminated_length": 579.933349609375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.34672290086746216, "learning_rate": 9.701111919237408e-07, "loss": -0.0, "num_tokens": 5264725.0, "reward": -0.2616002857685089, "reward_std": 0.37952175736427307, "rewards/cosine_scaled_reward/mean": -0.26361262798309326, "rewards/cosine_scaled_reward/std": 0.17531204223632812, "rewards/format_reward/mean": 0.265625, "rewards/format_reward/std": 0.44515693187713623, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 1681.84375, "completions/mean_terminated_length": 814.631591796875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.04914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.263967901468277, "learning_rate": 9.672327345550543e-07, "loss": -0.0, "num_tokens": 5383979.0, "reward": 0.13376155495643616, "reward_std": 0.46012288331985474, "rewards/cosine_scaled_reward/mean": -0.08155670762062073, "rewards/cosine_scaled_reward/std": 0.3612325191497803, "rewards/format_reward/mean": 0.296875, "rewards/format_reward/std": 0.4604927599430084, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1624.625, "completions/mean_terminated_length": 869.9130859375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.05028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.28927963972091675, "learning_rate": 9.64227184053598e-07, "loss": -0.0, "num_tokens": 5498651.0, "reward": 0.20869271457195282, "reward_std": 0.5558150410652161, "rewards/cosine_scaled_reward/mean": -0.0987786278128624, "rewards/cosine_scaled_reward/std": 0.42912590503692627, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.49501484632492065, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 2006.96875, "completions/mean_terminated_length": 1522.800048828125, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 0.05142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24254000186920166, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "num_tokens": 5638753.0, "reward": -0.2540697157382965, "reward_std": 0.4600578844547272, "rewards/cosine_scaled_reward/mean": -0.20515984296798706, "rewards/cosine_scaled_reward/std": 0.3251590430736542, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.36596253514289856, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 1765.984375, "completions/mean_terminated_length": 919.9375, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.052571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2645930349826813, "learning_rate": 9.578385041664925e-07, "loss": 0.0, "num_tokens": 5762944.0, "reward": -0.213707834482193, "reward_std": 0.38778313994407654, "rewards/cosine_scaled_reward/mean": -0.2318539321422577, "rewards/cosine_scaled_reward/std": 0.21436986327171326, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1583.40625, "completions/mean_terminated_length": 986.0714721679688, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.053714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.311797559261322, "learning_rate": 9.54457320834625e-07, "loss": 0.0, "num_tokens": 5874682.0, "reward": 0.27925533056259155, "reward_std": 0.6467443704605103, "rewards/cosine_scaled_reward/mean": -0.07912233471870422, "rewards/cosine_scaled_reward/std": 0.4737093150615692, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 1690.0625, "completions/mean_terminated_length": 1006.727294921875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.054857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.26644304394721985, "learning_rate": 9.509529358847654e-07, "loss": -0.0, "num_tokens": 5993390.0, "reward": 0.13692031800746918, "reward_std": 0.5655145049095154, "rewards/cosine_scaled_reward/mean": -0.12685233354568481, "rewards/cosine_scaled_reward/std": 0.32320985198020935, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.4917473793029785, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1387.140625, "completions/mean_terminated_length": 804.0294189453125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.3078882396221161, "learning_rate": 9.473264167865171e-07, "loss": 0.0, "num_tokens": 6092231.0, "reward": 0.35559189319610596, "reward_std": 0.5927403569221497, "rewards/cosine_scaled_reward/mean": -0.09564155340194702, "rewards/cosine_scaled_reward/std": 0.4046906530857086, "rewards/format_reward/mean": 0.546875, "rewards/format_reward/std": 0.501733124256134, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1674.890625, "completions/mean_terminated_length": 962.5909423828125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23925544321537018, "learning_rate": 9.43578868212728e-07, "loss": -0.0, "num_tokens": 6210240.0, "reward": 0.18573230504989624, "reward_std": 0.5264967083930969, "rewards/cosine_scaled_reward/mean": -0.09463384002447128, "rewards/cosine_scaled_reward/std": 0.4100942015647888, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.48795005679130554, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1347.40625, "completions/mean_terminated_length": 836.1621704101562, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.05828571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.325811505317688, "learning_rate": 9.397114317029974e-07, "loss": 0.0, "num_tokens": 6306682.0, "reward": 0.1735648661851883, "reward_std": 0.5335988998413086, "rewards/cosine_scaled_reward/mean": -0.21009255945682526, "rewards/cosine_scaled_reward/std": 0.2623959481716156, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.49501484632492065, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 1727.765625, "completions/mean_terminated_length": 767.0625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.05942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.27392977476119995, "learning_rate": 9.357252853159505e-07, "loss": 0.0, "num_tokens": 6428611.0, "reward": -0.16267812252044678, "reward_std": 0.5682471990585327, "rewards/cosine_scaled_reward/mean": -0.2219640612602234, "rewards/cosine_scaled_reward/std": 0.36739134788513184, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4531635046005249, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 1609.171875, "completions/mean_terminated_length": 924.5999755859375, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.060571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.28155064582824707, "learning_rate": 9.316216432703916e-07, "loss": -0.0, "num_tokens": 6542430.0, "reward": 0.0752667784690857, "reward_std": 0.7118167281150818, "rewards/cosine_scaled_reward/mean": -0.18892911076545715, "rewards/cosine_scaled_reward/std": 0.3222156763076782, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.501733124256134, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.53125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1588.234375, "completions/mean_terminated_length": 1067.166748046875, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.061714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2555343806743622, "learning_rate": 9.274017555754407e-07, "loss": 0.0, "num_tokens": 6655221.0, "reward": 0.6341299414634705, "reward_std": 1.0656921863555908, "rewards/cosine_scaled_reward/mean": 0.05143994837999344, "rewards/cosine_scaled_reward/std": 0.5348308086395264, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5029674172401428, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 1549.5625, "completions/mean_terminated_length": 821.0769653320312, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.06285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.30243629217147827, "learning_rate": 9.230669076497687e-07, "loss": -0.0, "num_tokens": 6764681.0, "reward": 0.13021975755691528, "reward_std": 0.3984764516353607, "rewards/cosine_scaled_reward/mean": -0.13801513612270355, "rewards/cosine_scaled_reward/std": 0.41228073835372925, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.49501484632492065, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1633.25, "completions/mean_terminated_length": 1132.689697265625, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.23835402727127075, "learning_rate": 9.186184199300463e-07, "loss": -0.0, "num_tokens": 6880169.0, "reward": 0.27981996536254883, "reward_std": 0.5018116235733032, "rewards/cosine_scaled_reward/mean": -0.10227750986814499, "rewards/cosine_scaled_reward/std": 0.481824666261673, "rewards/format_reward/mean": 0.484375, "rewards/format_reward/std": 0.5037065148353577, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 1699.875, "completions/mean_terminated_length": 1156.7999267578125, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.06514285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22349494695663452, "learning_rate": 9.140576474687263e-07, "loss": 0.0, "num_tokens": 7000529.0, "reward": -0.026505012065172195, "reward_std": 0.5785415172576904, "rewards/cosine_scaled_reward/mean": -0.20856501162052155, "rewards/cosine_scaled_reward/std": 0.2749907374382019, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.4917473793029785, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1457.875, "completions/mean_terminated_length": 1054.105224609375, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.06628571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.261942595243454, "learning_rate": 9.093859795212817e-07, "loss": 0.0, "num_tokens": 7103929.0, "reward": 0.5745843648910522, "reward_std": 0.8671218156814575, "rewards/cosine_scaled_reward/mean": -0.03302033245563507, "rewards/cosine_scaled_reward/std": 0.45529407262802124, "rewards/format_reward/mean": 0.640625, "rewards/format_reward/std": 0.4836103618144989, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1590.0625, "completions/mean_terminated_length": 1159.8787841796875, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.06742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24828943610191345, "learning_rate": 9.046048391230247e-07, "loss": -0.0, "num_tokens": 7216157.0, "reward": 0.3377103805541992, "reward_std": 0.5543617010116577, "rewards/cosine_scaled_reward/mean": -0.1045822948217392, "rewards/cosine_scaled_reward/std": 0.39040952920913696, "rewards/format_reward/mean": 0.546875, "rewards/format_reward/std": 0.501733124256134, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1622.84375, "completions/mean_terminated_length": 1076.21435546875, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.06857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2752656936645508, "learning_rate": 8.997156826556369e-07, "loss": -0.0, "num_tokens": 7330907.0, "reward": 0.11114693433046341, "reward_std": 0.6926254034042358, "rewards/cosine_scaled_reward/mean": -0.1788015365600586, "rewards/cosine_scaled_reward/std": 0.39409172534942627, "rewards/format_reward/mean": 0.46875, "rewards/format_reward/std": 0.5029674172401428, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1708.859375, "completions/mean_terminated_length": 1014.4285888671875, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.06971428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.22669929265975952, "learning_rate": 8.9471999940354e-07, "loss": -0.0, "num_tokens": 7451794.0, "reward": 0.2345120906829834, "reward_std": 0.6293160319328308, "rewards/cosine_scaled_reward/mean": -0.1093064472079277, "rewards/cosine_scaled_reward/std": 0.29189831018447876, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.501733124256134, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1281.53125, "completions/mean_terminated_length": 1004.2978515625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.07085714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.25438693165779114, "learning_rate": 8.896193111002475e-07, "loss": 0.0, "num_tokens": 7544044.0, "reward": 0.9180847406387329, "reward_std": 0.6390912532806396, "rewards/cosine_scaled_reward/mean": 0.06841734796762466, "rewards/cosine_scaled_reward/std": 0.48315128684043884, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 1310.46875, "completions/mean_terminated_length": 896.731689453125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.28795576095581055, "learning_rate": 8.844151714648274e-07, "loss": -0.0, "num_tokens": 7638170.0, "reward": 0.6488770246505737, "reward_std": 0.7876260876655579, "rewards/cosine_scaled_reward/mean": -0.019311510026454926, "rewards/cosine_scaled_reward/std": 0.4736698865890503, "rewards/format_reward/mean": 0.6875, "rewards/format_reward/std": 0.467176616191864, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 1307.625, "completions/mean_terminated_length": 1039.8297119140625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.07314285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.25637197494506836, "learning_rate": 8.791091657286267e-07, "loss": -0.0, "num_tokens": 7732810.0, "reward": 0.8280279636383057, "reward_std": 0.6804471015930176, "rewards/cosine_scaled_reward/mean": 0.015576483681797981, "rewards/cosine_scaled_reward/std": 0.44819310307502747, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 1322.125, "completions/mean_terminated_length": 914.9268188476562, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.07428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2944399118423462, "learning_rate": 8.737029101523929e-07, "loss": -0.0, "num_tokens": 7828130.0, "reward": 0.15610456466674805, "reward_std": 0.4606686234474182, "rewards/cosine_scaled_reward/mean": -0.24226020276546478, "rewards/cosine_scaled_reward/std": 0.33131492137908936, "rewards/format_reward/mean": 0.640625, "rewards/format_reward/std": 0.4836103618144989, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 1020.21875, "completions/mean_terminated_length": 806.9057006835938, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.07542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.32644009590148926, "learning_rate": 8.681980515339463e-07, "loss": 0.0, "num_tokens": 7903656.0, "reward": 0.7972471714019775, "reward_std": 0.7674820423126221, "rewards/cosine_scaled_reward/mean": -0.031063925474882126, "rewards/cosine_scaled_reward/std": 0.5106223225593567, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 1750.859375, "completions/mean_terminated_length": 1142.4285888671875, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.07657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2270829975605011, "learning_rate": 8.625962667065487e-07, "loss": 0.0, "num_tokens": 8026447.0, "reward": -0.1400720775127411, "reward_std": 0.3325888514518738, "rewards/cosine_scaled_reward/mean": -0.24972353875637054, "rewards/cosine_scaled_reward/std": 0.16404789686203003, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.4836103618144989, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1424.0, "completions/mean_length": 769.546875, "completions/mean_terminated_length": 637.2930908203125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.07771428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.37025144696235657, "learning_rate": 8.568992620281243e-07, "loss": -0.0, "num_tokens": 8084954.0, "reward": 0.9792699813842773, "reward_std": 0.804767370223999, "rewards/cosine_scaled_reward/mean": 0.03651002421975136, "rewards/cosine_scaled_reward/std": 0.46041443943977356, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 1086.234375, "completions/mean_terminated_length": 886.6226806640625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.07885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3763800263404846, "learning_rate": 8.511087728614862e-07, "loss": 0.0, "num_tokens": 8164817.0, "reward": 0.35803771018981934, "reward_std": 0.5702667236328125, "rewards/cosine_scaled_reward/mean": -0.24285613000392914, "rewards/cosine_scaled_reward/std": 0.3019825220108032, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 1463.375, "completions/mean_terminated_length": 1112.5999755859375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.24232418835163116, "learning_rate": 8.452265630457282e-07, "loss": -0.0, "num_tokens": 8269929.0, "reward": 0.3703588843345642, "reward_std": 0.7288752794265747, "rewards/cosine_scaled_reward/mean": -0.1351330280303955, "rewards/cosine_scaled_reward/std": 0.3751916289329529, "rewards/format_reward/mean": 0.640625, "rewards/format_reward/std": 0.4836103618144989, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1409.859375, "completions/mean_terminated_length": 973.2368774414062, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.08114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.300010621547699, "learning_rate": 8.392544243589427e-07, "loss": 0.0, "num_tokens": 8370880.0, "reward": 0.5196826457977295, "reward_std": 0.7097917795181274, "rewards/cosine_scaled_reward/mean": -0.044846177101135254, "rewards/cosine_scaled_reward/std": 0.508389949798584, "rewards/format_reward/mean": 0.609375, "rewards/format_reward/std": 0.4917473793029785, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 1228.046875, "completions/mean_terminated_length": 931.4680786132812, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.08228571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.30454304814338684, "learning_rate": 8.331941759724268e-07, "loss": -0.0, "num_tokens": 8459827.0, "reward": 0.41365131735801697, "reward_std": 0.5005639791488647, "rewards/cosine_scaled_reward/mean": -0.1759868562221527, "rewards/cosine_scaled_reward/std": 0.19868774712085724, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 1513.28125, "completions/mean_terminated_length": 1192.4500732421875, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.08342857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.27848970890045166, "learning_rate": 8.270476638965461e-07, "loss": -0.0, "num_tokens": 8567405.0, "reward": 0.09570223093032837, "reward_std": 0.5445049405097961, "rewards/cosine_scaled_reward/mean": -0.2802739143371582, "rewards/cosine_scaled_reward/std": 0.25603488087654114, "rewards/format_reward/mean": 0.65625, "rewards/format_reward/std": 0.4787135720252991, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1240.125, "completions/mean_terminated_length": 924.0, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.08457142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.2789021134376526, "learning_rate": 8.208167604184217e-07, "loss": 0.0, "num_tokens": 8656701.0, "reward": 0.7823752760887146, "reward_std": 0.6479132175445557, "rewards/cosine_scaled_reward/mean": 0.031812600791454315, "rewards/cosine_scaled_reward/std": 0.5397623181343079, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1455.953125, "completions/mean_terminated_length": 1186.8409423828125, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.08571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.22443196177482605, "learning_rate": 8.145033635316128e-07, "loss": 0.0, "num_tokens": 8760842.0, "reward": 0.8040015697479248, "reward_std": 0.5675323009490967, "rewards/cosine_scaled_reward/mean": 0.027000809088349342, "rewards/cosine_scaled_reward/std": 0.5096040964126587, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 1177.859375, "completions/mean_terminated_length": 863.1276245117188, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "epoch": 0.08685714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.32647648453712463, "learning_rate": 8.081093963579707e-07, "loss": 0.0, "num_tokens": 8846625.0, "reward": 0.310506671667099, "reward_std": 0.5110941529273987, "rewards/cosine_scaled_reward/mean": -0.2119341641664505, "rewards/cosine_scaled_reward/std": 0.24737994372844696, "rewards/format_reward/mean": 0.734375, "rewards/format_reward/std": 0.44515693187713623, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 1263.4375, "completions/mean_terminated_length": 1043.760009765625, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.088, "frac_reward_zero_std": 0.0, "grad_norm": 0.2545543611049652, "learning_rate": 8.01636806561836e-07, "loss": -0.0, "num_tokens": 8939061.0, "reward": 0.5484907031059265, "reward_std": 0.48998576402664185, "rewards/cosine_scaled_reward/mean": -0.13200464844703674, "rewards/cosine_scaled_reward/std": 0.3430649936199188, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39339789748191833, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1460.78125, "completions/mean_terminated_length": 1059.0, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.08914285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2583931088447571, "learning_rate": 7.950875657567621e-07, "loss": 0.0, "num_tokens": 9043271.0, "reward": 0.6075442433357239, "reward_std": 0.6895643472671509, "rewards/cosine_scaled_reward/mean": -0.0009153857827186584, "rewards/cosine_scaled_reward/std": 0.48922818899154663, "rewards/format_reward/mean": 0.609375, "rewards/format_reward/std": 0.4917473793029785, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 1054.875, "completions/mean_terminated_length": 892.3635864257812, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.09028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.29089078307151794, "learning_rate": 7.884636689049422e-07, "loss": 0.0, "num_tokens": 9120879.0, "reward": 0.6885831356048584, "reward_std": 0.508629322052002, "rewards/cosine_scaled_reward/mean": -0.09320840239524841, "rewards/cosine_scaled_reward/std": 0.38835227489471436, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1399.046875, "completions/mean_terminated_length": 1145.1087646484375, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.09142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.27458345890045166, "learning_rate": 7.817671337095244e-07, "loss": 0.0, "num_tokens": 9220810.0, "reward": 0.5549384355545044, "reward_std": 0.7092134952545166, "rewards/cosine_scaled_reward/mean": -0.09753081202507019, "rewards/cosine_scaled_reward/std": 0.4125780463218689, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 1084.984375, "completions/mean_terminated_length": 906.6481323242188, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.09257142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.37247684597969055, "learning_rate": 7.75e-07, "loss": -0.0, "num_tokens": 9301521.0, "reward": 0.5357480049133301, "reward_std": 0.5661624670028687, "rewards/cosine_scaled_reward/mean": -0.18525099754333496, "rewards/cosine_scaled_reward/std": 0.3385297954082489, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1260.921875, "completions/mean_terminated_length": 998.5625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.09371428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.27329322695732117, "learning_rate": 7.681643291108517e-07, "loss": -0.0, "num_tokens": 9392548.0, "reward": 0.9478914737701416, "reward_std": 0.4313860237598419, "rewards/cosine_scaled_reward/mean": 0.09894578158855438, "rewards/cosine_scaled_reward/std": 0.5477120876312256, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1309.671875, "completions/mean_terminated_length": 922.9285888671875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.09485714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3202998638153076, "learning_rate": 7.612622032536507e-07, "loss": -0.0, "num_tokens": 9487455.0, "reward": 0.5201998949050903, "reward_std": 0.6858996152877808, "rewards/cosine_scaled_reward/mean": -0.09927503764629364, "rewards/cosine_scaled_reward/std": 0.37909674644470215, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 1185.703125, "completions/mean_terminated_length": 965.9019775390625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.29646041989326477, "learning_rate": 7.54295724882796e-07, "loss": -0.0, "num_tokens": 9574036.0, "reward": 0.6779025793075562, "reward_std": 0.557724118232727, "rewards/cosine_scaled_reward/mean": -0.09073619544506073, "rewards/cosine_scaled_reward/std": 0.3855368196964264, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1297.828125, "completions/mean_terminated_length": 1158.907470703125, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "epoch": 0.09714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.21307455003261566, "learning_rate": 7.472670160550848e-07, "loss": 0.0, "num_tokens": 9667417.0, "reward": 0.5093189477920532, "reward_std": 0.6006681323051453, "rewards/cosine_scaled_reward/mean": -0.1672155261039734, "rewards/cosine_scaled_reward/std": 0.34896284341812134, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1348.90625, "completions/mean_terminated_length": 1096.04248046875, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 0.09828571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.2883393168449402, "learning_rate": 7.401782177833147e-07, "loss": -0.0, "num_tokens": 9764603.0, "reward": 0.8025823831558228, "reward_std": 0.547119677066803, "rewards/cosine_scaled_reward/mean": 0.01847870647907257, "rewards/cosine_scaled_reward/std": 0.4346420168876648, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1086.96875, "completions/mean_terminated_length": 909.0, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.09942857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.31888866424560547, "learning_rate": 7.330314893841101e-07, "loss": -0.0, "num_tokens": 9844289.0, "reward": 0.5533354878425598, "reward_std": 0.5319498777389526, "rewards/cosine_scaled_reward/mean": -0.1530197560787201, "rewards/cosine_scaled_reward/std": 0.2434682846069336, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 954.921875, "completions/mean_terminated_length": 919.6612548828125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.10057142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.3025936484336853, "learning_rate": 7.258290078201731e-07, "loss": -0.0, "num_tokens": 9915916.0, "reward": 1.2692296504974365, "reward_std": 0.5115163326263428, "rewards/cosine_scaled_reward/mean": 0.13461479544639587, "rewards/cosine_scaled_reward/std": 0.506001353263855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 1351.8125, "completions/mean_terminated_length": 1174.35302734375, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "epoch": 0.10171428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.23423585295677185, "learning_rate": 7.185729670371604e-07, "loss": -0.0, "num_tokens": 10013432.0, "reward": 0.724889874458313, "reward_std": 0.7425336837768555, "rewards/cosine_scaled_reward/mean": -0.0828675627708435, "rewards/cosine_scaled_reward/std": 0.3893774449825287, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1906.0, "completions/mean_length": 1153.28125, "completions/mean_terminated_length": 1025.46435546875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.10285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3860023021697998, "learning_rate": 7.11265577295385e-07, "loss": -0.0, "num_tokens": 10097242.0, "reward": 0.5000253915786743, "reward_std": 0.5103108286857605, "rewards/cosine_scaled_reward/mean": -0.18748730421066284, "rewards/cosine_scaled_reward/std": 0.2787182629108429, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1455.484375, "completions/mean_terminated_length": 1166.1163330078125, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 0.2551063895225525, "learning_rate": 7.039090644965509e-07, "loss": 0.0, "num_tokens": 10200961.0, "reward": 0.4053259789943695, "reward_std": 0.663999617099762, "rewards/cosine_scaled_reward/mean": -0.18796202540397644, "rewards/cosine_scaled_reward/std": 0.35777655243873596, "rewards/format_reward/mean": 0.78125, "rewards/format_reward/std": 0.4166666865348816, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1176.953125, "completions/mean_terminated_length": 1015.6481323242188, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.10514285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.27449366450309753, "learning_rate": 6.965056695057204e-07, "loss": -0.0, "num_tokens": 10286278.0, "reward": 0.5743436217308044, "reward_std": 0.6229422092437744, "rewards/cosine_scaled_reward/mean": -0.15032817423343658, "rewards/cosine_scaled_reward/std": 0.2899566888809204, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1434.875, "completions/mean_terminated_length": 1156.181884765625, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.10628571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2839376926422119, "learning_rate": 6.890576474687263e-07, "loss": 0.0, "num_tokens": 10389454.0, "reward": 0.30658647418022156, "reward_std": 0.5343226194381714, "rewards/cosine_scaled_reward/mean": -0.22951926290988922, "rewards/cosine_scaled_reward/std": 0.2324177473783493, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 1242.390625, "completions/mean_terminated_length": 927.1522216796875, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.10742857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2985072433948517, "learning_rate": 6.815672671252315e-07, "loss": 0.0, "num_tokens": 10478735.0, "reward": 0.6593698263168335, "reward_std": 0.5845412015914917, "rewards/cosine_scaled_reward/mean": -0.02969011664390564, "rewards/cosine_scaled_reward/std": 0.47056320309638977, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 1203.265625, "completions/mean_terminated_length": 1082.58935546875, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.10857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2689598798751831, "learning_rate": 6.740368101176495e-07, "loss": 0.0, "num_tokens": 10566272.0, "reward": 0.4301251173019409, "reward_std": 0.4795047640800476, "rewards/cosine_scaled_reward/mean": -0.22243742644786835, "rewards/cosine_scaled_reward/std": 0.2575407326221466, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 1205.5625, "completions/mean_terminated_length": 990.8235473632812, "completions/min_length": 441.0, "completions/min_terminated_length": 441.0, "epoch": 0.10971428571428571, "frac_reward_zero_std": 0.0, "grad_norm": 0.30502915382385254, "learning_rate": 6.664685702961344e-07, "loss": -0.0, "num_tokens": 10654564.0, "reward": 0.896080493927002, "reward_std": 0.6987663507461548, "rewards/cosine_scaled_reward/mean": 0.02616523765027523, "rewards/cosine_scaled_reward/std": 0.460237056016922, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 1170.390625, "completions/mean_terminated_length": 988.2453002929688, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "epoch": 0.11085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3103901743888855, "learning_rate": 6.588648530198504e-07, "loss": -0.0, "num_tokens": 10739733.0, "reward": 0.6633297204971313, "reward_std": 0.609075665473938, "rewards/cosine_scaled_reward/mean": -0.12927262485027313, "rewards/cosine_scaled_reward/std": 0.4114542305469513, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 1136.5625, "completions/mean_terminated_length": 947.396240234375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.2510873079299927, "learning_rate": 6.512279744547392e-07, "loss": 0.0, "num_tokens": 10823537.0, "reward": 0.6613268256187439, "reward_std": 0.4785424768924713, "rewards/cosine_scaled_reward/mean": -0.09902409464120865, "rewards/cosine_scaled_reward/std": 0.4345317482948303, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1171.8125, "completions/mean_terminated_length": 1081.17236328125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.11314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.281054824590683, "learning_rate": 6.435602608679916e-07, "loss": -0.0, "num_tokens": 10909701.0, "reward": 1.0416245460510254, "reward_std": 0.6949809789657593, "rewards/cosine_scaled_reward/mean": 0.0520622618496418, "rewards/cosine_scaled_reward/std": 0.508481502532959, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24397502839565277, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1120.8125, "completions/mean_terminated_length": 1024.8966064453125, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.11428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.2910788655281067, "learning_rate": 6.358640479194451e-07, "loss": 0.0, "num_tokens": 10991145.0, "reward": 1.2036188840866089, "reward_std": 0.8533884286880493, "rewards/cosine_scaled_reward/mean": 0.14087192714214325, "rewards/cosine_scaled_reward/std": 0.5375887751579285, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1076.953125, "completions/mean_terminated_length": 1029.1966552734375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.11542857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.33955609798431396, "learning_rate": 6.281416799501187e-07, "loss": 0.0, "num_tokens": 11071502.0, "reward": 0.7810705900192261, "reward_std": 0.5973731279373169, "rewards/cosine_scaled_reward/mean": -0.10165221989154816, "rewards/cosine_scaled_reward/std": 0.4130260646343231, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1092.078125, "completions/mean_terminated_length": 935.654541015625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.11657142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.34537607431411743, "learning_rate": 6.203955092681039e-07, "loss": 0.0, "num_tokens": 11151547.0, "reward": 0.6441041231155396, "reward_std": 0.53089839220047, "rewards/cosine_scaled_reward/mean": -0.10763543844223022, "rewards/cosine_scaled_reward/std": 0.39948928356170654, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1120.625, "completions/mean_terminated_length": 1006.7368774414062, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.11771428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.343980997800827, "learning_rate": 6.126278954320294e-07, "loss": 0.0, "num_tokens": 11233619.0, "reward": 0.6925251483917236, "reward_std": 0.5938367247581482, "rewards/cosine_scaled_reward/mean": -0.13029994070529938, "rewards/cosine_scaled_reward/std": 0.37749138474464417, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 1120.359375, "completions/mean_terminated_length": 948.5740966796875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.11885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.30854102969169617, "learning_rate": 6.048412045323164e-07, "loss": -0.0, "num_tokens": 11315786.0, "reward": 0.560060977935791, "reward_std": 0.5216183662414551, "rewards/cosine_scaled_reward/mean": -0.1418444812297821, "rewards/cosine_scaled_reward/std": 0.33836889266967773, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 1158.421875, "completions/mean_terminated_length": 953.1346435546875, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 0.12, "frac_reward_zero_std": 0.0, "grad_norm": 0.29531243443489075, "learning_rate": 5.97037808470444e-07, "loss": -0.0, "num_tokens": 11401213.0, "reward": 1.0410652160644531, "reward_std": 0.7858219742774963, "rewards/cosine_scaled_reward/mean": 0.09084508568048477, "rewards/cosine_scaled_reward/std": 0.5061684250831604, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 1045.859375, "completions/mean_terminated_length": 837.867919921875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.12114285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.26259294152259827, "learning_rate": 5.892200842364462e-07, "loss": -0.0, "num_tokens": 11478980.0, "reward": 1.0545225143432617, "reward_std": 0.7633667588233948, "rewards/cosine_scaled_reward/mean": 0.07413630187511444, "rewards/cosine_scaled_reward/std": 0.48842984437942505, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1101.234375, "completions/mean_terminated_length": 946.30908203125, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.12228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.3363504409790039, "learning_rate": 5.813904131848564e-07, "loss": 0.0, "num_tokens": 11560611.0, "reward": 0.648673415184021, "reward_std": 0.6051540970802307, "rewards/cosine_scaled_reward/mean": -0.11316327750682831, "rewards/cosine_scaled_reward/std": 0.37149766087532043, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 1225.28125, "completions/mean_terminated_length": 1054.5283203125, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.12342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2867675721645355, "learning_rate": 5.735511803093248e-07, "loss": 0.0, "num_tokens": 11649389.0, "reward": 0.560509204864502, "reward_std": 0.6691359877586365, "rewards/cosine_scaled_reward/mean": -0.14943289756774902, "rewards/cosine_scaled_reward/std": 0.4461749494075775, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1227.203125, "completions/mean_terminated_length": 1056.84912109375, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "epoch": 0.12457142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2772690951824188, "learning_rate": 5.657047735161255e-07, "loss": -0.0, "num_tokens": 11739178.0, "reward": 0.6980891227722168, "reward_std": 0.624833345413208, "rewards/cosine_scaled_reward/mean": -0.0650179386138916, "rewards/cosine_scaled_reward/std": 0.41062912344932556, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.38025420904159546, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 1145.0, "completions/mean_terminated_length": 914.8235473632812, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "epoch": 0.12571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.3468596637248993, "learning_rate": 5.578535828967777e-07, "loss": -0.0, "num_tokens": 11823234.0, "reward": 0.6972323656082153, "reward_std": 0.5477026104927063, "rewards/cosine_scaled_reward/mean": -0.08888379484415054, "rewards/cosine_scaled_reward/std": 0.3565239906311035, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.3333333432674408, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 977.046875, "completions/mean_terminated_length": 977.046875, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.12685714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3180137574672699, "learning_rate": 5.5e-07, "loss": 0.0, "num_tokens": 11895885.0, "reward": 0.8744360208511353, "reward_std": 0.5815237164497375, "rewards/cosine_scaled_reward/mean": -0.06278196722269058, "rewards/cosine_scaled_reward/std": 0.37791064381599426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 1269.421875, "completions/mean_terminated_length": 1089.75, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "epoch": 0.128, "frac_reward_zero_std": 0.0, "grad_norm": 0.2817465364933014, "learning_rate": 5.421464171032224e-07, "loss": -0.0, "num_tokens": 11988224.0, "reward": 0.9151681065559387, "reward_std": 0.594943642616272, "rewards/cosine_scaled_reward/mean": 0.02789657562971115, "rewards/cosine_scaled_reward/std": 0.4965399205684662, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.3503824472427368, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 934.578125, "completions/mean_terminated_length": 934.578125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.12914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.3341560959815979, "learning_rate": 5.342952264838747e-07, "loss": -0.0, "num_tokens": 12058333.0, "reward": 1.0256879329681396, "reward_std": 0.717230498790741, "rewards/cosine_scaled_reward/mean": 0.02065650373697281, "rewards/cosine_scaled_reward/std": 0.4963410794734955, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 1055.21875, "completions/mean_terminated_length": 971.0847778320312, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.13028571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3800676763057709, "learning_rate": 5.264488196906752e-07, "loss": -0.0, "num_tokens": 12135715.0, "reward": 0.649993896484375, "reward_std": 0.5865596532821655, "rewards/cosine_scaled_reward/mean": -0.1750030517578125, "rewards/cosine_scaled_reward/std": 0.3388007879257202, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 1169.671875, "completions/mean_terminated_length": 987.3773803710938, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.13142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.3112519085407257, "learning_rate": 5.186095868151436e-07, "loss": 0.0, "num_tokens": 12221790.0, "reward": 0.7184536457061768, "reward_std": 0.44992831349372864, "rewards/cosine_scaled_reward/mean": -0.06264819949865341, "rewards/cosine_scaled_reward/std": 0.44565486907958984, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1224.890625, "completions/mean_terminated_length": 1072.4630126953125, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.13257142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.2884223461151123, "learning_rate": 5.107799157635538e-07, "loss": 0.0, "num_tokens": 12311567.0, "reward": 0.8372049927711487, "reward_std": 0.608986496925354, "rewards/cosine_scaled_reward/mean": -0.026710007339715958, "rewards/cosine_scaled_reward/std": 0.4437602162361145, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1078.65625, "completions/mean_terminated_length": 1030.9835205078125, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "epoch": 0.1337142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3016076385974884, "learning_rate": 5.02962191529556e-07, "loss": -0.0, "num_tokens": 12391625.0, "reward": 0.8182538747787476, "reward_std": 0.6463132500648499, "rewards/cosine_scaled_reward/mean": -0.09087307006120682, "rewards/cosine_scaled_reward/std": 0.3895137310028076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 1226.046875, "completions/mean_terminated_length": 952.0625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.13485714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.2991194427013397, "learning_rate": 4.951587954676837e-07, "loss": 0.0, "num_tokens": 12480628.0, "reward": 0.6370267868041992, "reward_std": 0.7525250911712646, "rewards/cosine_scaled_reward/mean": -0.056486621499061584, "rewards/cosine_scaled_reward/std": 0.44576171040534973, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4364357888698578, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1038.96875, "completions/mean_terminated_length": 894.8214721679688, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.136, "frac_reward_zero_std": 0.0, "grad_norm": 0.4483291506767273, "learning_rate": 4.873721045679706e-07, "loss": 0.0, "num_tokens": 12557530.0, "reward": 0.9855979084968567, "reward_std": 0.6055079698562622, "rewards/cosine_scaled_reward/mean": 0.04748644679784775, "rewards/cosine_scaled_reward/std": 0.47108832001686096, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 856.578125, "completions/mean_terminated_length": 818.1451416015625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.13714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.3406151831150055, "learning_rate": 4.79604490731896e-07, "loss": -0.0, "num_tokens": 12622807.0, "reward": 0.7979192733764648, "reward_std": 0.6180044412612915, "rewards/cosine_scaled_reward/mean": -0.10104038566350937, "rewards/cosine_scaled_reward/std": 0.44317325949668884, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 726.34375, "completions/mean_terminated_length": 683.7096557617188, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.1382857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.4178949296474457, "learning_rate": 4.7185832004988133e-07, "loss": 0.0, "num_tokens": 12678989.0, "reward": 1.161607265472412, "reward_std": 0.6393733024597168, "rewards/cosine_scaled_reward/mean": 0.08080361783504486, "rewards/cosine_scaled_reward/std": 0.5313310027122498, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1133.796875, "completions/mean_terminated_length": 1039.22412109375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.13942857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.3333284258842468, "learning_rate": 4.641359520805548e-07, "loss": 0.0, "num_tokens": 12763112.0, "reward": 0.9356573820114136, "reward_std": 0.6247758269309998, "rewards/cosine_scaled_reward/mean": -0.02435879409313202, "rewards/cosine_scaled_reward/std": 0.4759780466556549, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1031.296875, "completions/mean_terminated_length": 981.2950439453125, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.14057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.29939791560173035, "learning_rate": 4.5643973913200837e-07, "loss": -0.0, "num_tokens": 12839347.0, "reward": 0.7725162506103516, "reward_std": 0.5560778379440308, "rewards/cosine_scaled_reward/mean": -0.09811685979366302, "rewards/cosine_scaled_reward/std": 0.3822804391384125, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 979.234375, "completions/mean_terminated_length": 944.758056640625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.1417142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.34992095828056335, "learning_rate": 4.4877202554526084e-07, "loss": 0.0, "num_tokens": 12912970.0, "reward": 1.085427165031433, "reward_std": 0.6837464570999146, "rewards/cosine_scaled_reward/mean": 0.05052608996629715, "rewards/cosine_scaled_reward/std": 0.4791998267173767, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1076.40625, "completions/mean_terminated_length": 994.0678100585938, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.27060386538505554, "learning_rate": 4.4113514698014953e-07, "loss": -0.0, "num_tokens": 12992788.0, "reward": 1.0397578477859497, "reward_std": 0.43823006749153137, "rewards/cosine_scaled_reward/mean": 0.019878946244716644, "rewards/cosine_scaled_reward/std": 0.46214956045150757, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1071.53125, "completions/mean_terminated_length": 1006.4334106445312, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "epoch": 0.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.2776121497154236, "learning_rate": 4.3353142970386557e-07, "loss": 0.0, "num_tokens": 13072662.0, "reward": 1.0028693675994873, "reward_std": 0.6879971027374268, "rewards/cosine_scaled_reward/mean": 0.0014346465468406677, "rewards/cosine_scaled_reward/std": 0.42488595843315125, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 1180.484375, "completions/mean_terminated_length": 1056.5535888671875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.14514285714285713, "frac_reward_zero_std": 0.0, "grad_norm": 0.2829054594039917, "learning_rate": 4.2596318988235037e-07, "loss": -0.0, "num_tokens": 13159309.0, "reward": 0.6576684713363647, "reward_std": 0.66895592212677, "rewards/cosine_scaled_reward/mean": -0.15554077923297882, "rewards/cosine_scaled_reward/std": 0.3959099054336548, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 1053.328125, "completions/mean_terminated_length": 950.4310302734375, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.1462857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.29738253355026245, "learning_rate": 4.1843273287476854e-07, "loss": -0.0, "num_tokens": 13237074.0, "reward": 0.8851851224899292, "reward_std": 0.7390589118003845, "rewards/cosine_scaled_reward/mean": -0.041782446205616, "rewards/cosine_scaled_reward/std": 0.46901625394821167, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1228.484375, "completions/mean_terminated_length": 1111.4107666015625, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.14742857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.25943535566329956, "learning_rate": 4.1094235253127374e-07, "loss": -0.0, "num_tokens": 13326401.0, "reward": 0.9628820419311523, "reward_std": 0.6490253210067749, "rewards/cosine_scaled_reward/mean": 0.004878522828221321, "rewards/cosine_scaled_reward/std": 0.45456331968307495, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1089.578125, "completions/mean_terminated_length": 952.6607666015625, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.14857142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.3009719252586365, "learning_rate": 4.034943304942796e-07, "loss": 0.0, "num_tokens": 13406638.0, "reward": 0.5984547138214111, "reward_std": 0.7008002996444702, "rewards/cosine_scaled_reward/mean": -0.14608514308929443, "rewards/cosine_scaled_reward/std": 0.37894922494888306, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 1058.03125, "completions/mean_terminated_length": 916.607177734375, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "epoch": 0.14971428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.306725412607193, "learning_rate": 3.9609093550344907e-07, "loss": 0.0, "num_tokens": 13484088.0, "reward": 1.0469268560409546, "reward_std": 0.6023457050323486, "rewards/cosine_scaled_reward/mean": 0.0703384131193161, "rewards/cosine_scaled_reward/std": 0.47298464179039, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 1342.78125, "completions/mean_terminated_length": 919.6500244140625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.15085714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.3032574951648712, "learning_rate": 3.8873442270461485e-07, "loss": -0.0, "num_tokens": 13581090.0, "reward": 0.4643245339393616, "reward_std": 0.7533800601959229, "rewards/cosine_scaled_reward/mean": -0.06471271812915802, "rewards/cosine_scaled_reward/std": 0.4610835611820221, "rewards/format_reward/mean": 0.59375, "rewards/format_reward/std": 0.49501484632492065, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1144.921875, "completions/mean_terminated_length": 957.4906005859375, "completions/min_length": 451.0, "completions/min_terminated_length": 451.0, "epoch": 0.152, "frac_reward_zero_std": 0.0, "grad_norm": 0.32285141944885254, "learning_rate": 3.8142703296283953e-07, "loss": 0.0, "num_tokens": 13665589.0, "reward": 0.5014957189559937, "reward_std": 0.5352932214736938, "rewards/cosine_scaled_reward/mean": -0.17112717032432556, "rewards/cosine_scaled_reward/std": 0.28127768635749817, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 975.53125, "completions/mean_terminated_length": 958.5079956054688, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.15314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.40716752409935, "learning_rate": 3.7417099217982686e-07, "loss": -0.0, "num_tokens": 13738591.0, "reward": 1.1759617328643799, "reward_std": 0.4804629683494568, "rewards/cosine_scaled_reward/mean": 0.08798093348741531, "rewards/cosine_scaled_reward/std": 0.5343761444091797, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 758.515625, "completions/mean_terminated_length": 758.515625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.15428571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.42696353793144226, "learning_rate": 3.6696851061588994e-07, "loss": -0.0, "num_tokens": 13797608.0, "reward": 1.3851683139801025, "reward_std": 0.5234883427619934, "rewards/cosine_scaled_reward/mean": 0.19258417189121246, "rewards/cosine_scaled_reward/std": 0.49346473813056946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1169.875, "completions/mean_terminated_length": 1095.4576416015625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.15542857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.28027620911598206, "learning_rate": 3.5982178221668533e-07, "loss": -0.0, "num_tokens": 13883152.0, "reward": 1.0174503326416016, "reward_std": 0.5889347791671753, "rewards/cosine_scaled_reward/mean": 0.016537662595510483, "rewards/cosine_scaled_reward/std": 0.4763922095298767, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1105.3125, "completions/mean_terminated_length": 1042.4666748046875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 0.15657142857142858, "frac_reward_zero_std": 0.0, "grad_norm": 0.3002299666404724, "learning_rate": 3.5273298394491515e-07, "loss": 0.0, "num_tokens": 13964500.0, "reward": 0.841381847858429, "reward_std": 0.6354345083236694, "rewards/cosine_scaled_reward/mean": -0.07149658352136612, "rewards/cosine_scaled_reward/std": 0.4138363003730774, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1125.484375, "completions/mean_terminated_length": 974.5272216796875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.15771428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.28766506910324097, "learning_rate": 3.45704275117204e-07, "loss": -0.0, "num_tokens": 14047843.0, "reward": 0.8758631944656372, "reward_std": 0.7212573289871216, "rewards/cosine_scaled_reward/mean": -0.05425591766834259, "rewards/cosine_scaled_reward/std": 0.4783853590488434, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1216.171875, "completions/mean_terminated_length": 1160.7166748046875, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.15885714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2882857024669647, "learning_rate": 3.387377967463493e-07, "loss": -0.0, "num_tokens": 14136318.0, "reward": 0.7189284563064575, "reward_std": 0.4593912959098816, "rewards/cosine_scaled_reward/mean": -0.13272328674793243, "rewards/cosine_scaled_reward/std": 0.33584704995155334, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1142.140625, "completions/mean_terminated_length": 1012.732177734375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.16, "frac_reward_zero_std": 0.125, "grad_norm": 0.3000667095184326, "learning_rate": 3.3183567088914833e-07, "loss": 0.0, "num_tokens": 14219639.0, "reward": 0.8278639316558838, "reward_std": 0.46724599599838257, "rewards/cosine_scaled_reward/mean": -0.03919300064444542, "rewards/cosine_scaled_reward/std": 0.4650508463382721, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 1025.421875, "completions/mean_terminated_length": 975.131103515625, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "epoch": 0.16114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.3207882046699524, "learning_rate": 3.250000000000001e-07, "loss": 0.0, "num_tokens": 14295826.0, "reward": 0.8871637582778931, "reward_std": 0.6538586616516113, "rewards/cosine_scaled_reward/mean": -0.04079316183924675, "rewards/cosine_scaled_reward/std": 0.43451616168022156, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1233.90625, "completions/mean_terminated_length": 1149.689697265625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.16228571428571428, "frac_reward_zero_std": 0.0, "grad_norm": 0.3009903132915497, "learning_rate": 3.182328662904756e-07, "loss": 0.0, "num_tokens": 14385300.0, "reward": 0.8573208451271057, "reward_std": 0.6099269390106201, "rewards/cosine_scaled_reward/mean": -0.055714573711156845, "rewards/cosine_scaled_reward/std": 0.43728360533714294, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 1136.078125, "completions/mean_terminated_length": 1005.8035888671875, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.16342857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.31794917583465576, "learning_rate": 3.115363310950578e-07, "loss": 0.0, "num_tokens": 14468825.0, "reward": 0.6553314924240112, "reward_std": 0.6344339847564697, "rewards/cosine_scaled_reward/mean": -0.11764675378799438, "rewards/cosine_scaled_reward/std": 0.3099633455276489, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1220.6875, "completions/mean_terminated_length": 1029.769287109375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.16457142857142856, "frac_reward_zero_std": 0.0, "grad_norm": 0.3814108967781067, "learning_rate": 3.0491243424323783e-07, "loss": 0.0, "num_tokens": 14558437.0, "reward": 0.7285318970680237, "reward_std": 0.8925961256027222, "rewards/cosine_scaled_reward/mean": -0.05760904401540756, "rewards/cosine_scaled_reward/std": 0.492266446352005, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.36596253514289856, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1880.0, "completions/mean_length": 969.796875, "completions/mean_terminated_length": 916.7704467773438, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.1657142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.3201180398464203, "learning_rate": 2.9836319343816397e-07, "loss": -0.0, "num_tokens": 14630448.0, "reward": 0.8149441480636597, "reward_std": 0.5824600458145142, "rewards/cosine_scaled_reward/mean": -0.08471541851758957, "rewards/cosine_scaled_reward/std": 0.475755512714386, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 1034.484375, "completions/mean_terminated_length": 966.9166870117188, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.16685714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.28184273838996887, "learning_rate": 2.918906036420294e-07, "loss": -0.0, "num_tokens": 14707271.0, "reward": 0.8387603759765625, "reward_std": 0.5346506237983704, "rewards/cosine_scaled_reward/mean": -0.07280732691287994, "rewards/cosine_scaled_reward/std": 0.43024110794067383, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1249.984375, "completions/mean_terminated_length": 1046.568603515625, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 0.168, "frac_reward_zero_std": 0.0, "grad_norm": 0.32145801186561584, "learning_rate": 2.854966364683872e-07, "loss": 0.0, "num_tokens": 14798054.0, "reward": 0.7505484819412231, "reward_std": 0.5473448634147644, "rewards/cosine_scaled_reward/mean": -0.07003828883171082, "rewards/cosine_scaled_reward/std": 0.4046306014060974, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.3145764470100403, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 1062.828125, "completions/mean_terminated_length": 960.913818359375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.16914285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.2667451500892639, "learning_rate": 2.791832395815782e-07, "loss": -0.0, "num_tokens": 14877259.0, "reward": 0.7823130488395691, "reward_std": 0.48230016231536865, "rewards/cosine_scaled_reward/mean": -0.06978099048137665, "rewards/cosine_scaled_reward/std": 0.37567150592803955, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1386.875, "completions/mean_terminated_length": 1086.3636474609375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.1702857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2730913758277893, "learning_rate": 2.729523361034538e-07, "loss": 0.0, "num_tokens": 14977915.0, "reward": 0.48214927315711975, "reward_std": 0.8376681804656982, "rewards/cosine_scaled_reward/mean": -0.14173786342144012, "rewards/cosine_scaled_reward/std": 0.4272434711456299, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 994.15625, "completions/mean_terminated_length": 942.3278198242188, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.17142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2946690022945404, "learning_rate": 2.6680582402757324e-07, "loss": -0.0, "num_tokens": 15052045.0, "reward": 0.8893749713897705, "reward_std": 0.7130615711212158, "rewards/cosine_scaled_reward/mean": -0.05531252920627594, "rewards/cosine_scaled_reward/std": 0.4389563202857971, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 150 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 15052045, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }