{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.034139159749930655, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 3047.3671875, "completions/mean_terminated_length": 2263.057861328125, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.00034139159749930657, "frac_reward_zero_std": 0.5, "grad_norm": 0.35972216725349426, "kl": 0.0014982223510742188, "learning_rate": 0.0, "loss": 0.0177, "num_tokens": 410995.0, "reward": 0.708984375, "reward_std": 0.2064703404903412, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.17173513770103455, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 3011.8359375, "completions/mean_terminated_length": 1956.5111083984375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.0006827831949986131, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3065022826194763, "kl": 0.00147247314453125, "learning_rate": 4.444444444444445e-07, "loss": 0.0107, "num_tokens": 816098.0, "reward": 0.5234375, "reward_std": 0.25197339057922363, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.19826386868953705, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2760.28125, "completions/mean_terminated_length": 2010.328369140625, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.0010241747924979196, "frac_reward_zero_std": 0.6875, "grad_norm": 159714496.0, "kl": 548864.0013122559, "learning_rate": 8.88888888888889e-07, "loss": 21942.8555, "num_tokens": 1191162.0, "reward": 0.859375, "reward_std": 0.13880334794521332, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.1746762990951538, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3580.0, "completions/mean_length": 3082.3828125, "completions/mean_terminated_length": 2055.261962890625, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 0.0013655663899972263, "frac_reward_zero_std": 0.75, "grad_norm": 0.22585342824459076, "kl": 0.0015778541564941406, "learning_rate": 1.3333333333333334e-06, "loss": 0.0142, "num_tokens": 1607087.0, "reward": 0.5390625, "reward_std": 0.046989135444164276, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.14860738813877106, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3479.0, "completions/mean_length": 2815.2578125, "completions/mean_terminated_length": 2046.515625, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.0017069579874965327, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3610924482345581, "kl": 0.0015110969543457031, "learning_rate": 1.777777777777778e-06, "loss": 0.0113, "num_tokens": 1989096.0, "reward": 0.80078125, "reward_std": 0.22490110993385315, "rewards/accuracy_reward/mean": 0.2016129046678543, "rewards/accuracy_reward/std": 0.4028322100639343, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.41015625, "rewards/tag_count_reward/std": 0.18221724033355713, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 2983.78125, "completions/mean_terminated_length": 1949.3616943359375, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "epoch": 0.002048349584995839, "frac_reward_zero_std": 0.53125, "grad_norm": 0.7327002286911011, "kl": 0.0018010139465332031, "learning_rate": 2.222222222222222e-06, "loss": 0.0143, "num_tokens": 2394080.0, "reward": 0.5390625, "reward_std": 0.14405298233032227, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.1854381412267685, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3223.0, "completions/mean_length": 3160.3828125, "completions/mean_terminated_length": 1776.5667724609375, "completions/min_length": 654.0, "completions/min_terminated_length": 654.0, "epoch": 0.0023897411824951456, "frac_reward_zero_std": 0.53125, "grad_norm": 0.34306737780570984, "kl": 0.001575469970703125, "learning_rate": 2.666666666666667e-06, "loss": 0.0121, "num_tokens": 2817985.0, "reward": 0.470703125, "reward_std": 0.20495890080928802, "rewards/accuracy_reward/mean": 0.05645161122083664, "rewards/accuracy_reward/std": 0.23172801733016968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.1877356618642807, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3559.0, "completions/mean_length": 2887.1015625, "completions/mean_terminated_length": 2190.203125, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.0027311327799944525, "frac_reward_zero_std": 0.375, "grad_norm": 0.4181869328022003, "kl": 0.0015878677368164062, "learning_rate": 3.1111111111111116e-06, "loss": 0.0206, "num_tokens": 3207990.0, "reward": 0.990234375, "reward_std": 0.3269186019897461, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.447474867105484, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.443359375, "rewards/tag_count_reward/std": 0.20861099660396576, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3502.0, "completions/mean_length": 2897.7734375, "completions/mean_terminated_length": 1894.8270263671875, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "epoch": 0.003072524377493759, "frac_reward_zero_std": 0.5, "grad_norm": 0.4795713722705841, "kl": 0.0025396347045898438, "learning_rate": 3.555555555555556e-06, "loss": 0.0176, "num_tokens": 3598633.0, "reward": 0.71875, "reward_std": 0.24768972396850586, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.484375, "rewards/tag_count_reward/std": 0.30376574397087097, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 2891.34375, "completions/mean_terminated_length": 1971.9998779296875, "completions/min_length": 684.0, "completions/min_terminated_length": 684.0, "epoch": 0.0034139159749930655, "frac_reward_zero_std": 0.4375, "grad_norm": 2.6263070106506348, "kl": 0.005030155181884766, "learning_rate": 4.000000000000001e-06, "loss": 0.0306, "num_tokens": 3987989.0, "reward": 0.623046875, "reward_std": 0.20070670545101166, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.513671875, "rewards/tag_count_reward/std": 0.30628037452697754, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3505.0, "completions/mean_length": 2942.09375, "completions/mean_terminated_length": 2003.923095703125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "epoch": 0.003755307572492372, "frac_reward_zero_std": 0.46875, "grad_norm": 0.49031442403793335, "kl": 0.0036568641662597656, "learning_rate": 4.444444444444444e-06, "loss": 0.0228, "num_tokens": 4384753.0, "reward": 0.85546875, "reward_std": 0.39348509907722473, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.54296875, "rewards/tag_count_reward/std": 0.3380545377731323, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 3020.046875, "completions/mean_terminated_length": 2271.527099609375, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 0.004096699169991678, "frac_reward_zero_std": 0.375, "grad_norm": 0.3891095519065857, "kl": 0.0040073394775390625, "learning_rate": 4.888888888888889e-06, "loss": 0.0528, "num_tokens": 4790443.0, "reward": 0.806640625, "reward_std": 0.2784914970397949, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.587890625, "rewards/tag_count_reward/std": 0.34451529383659363, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 2736.6875, "completions/mean_terminated_length": 1745.7626953125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.004438090767490985, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3189328908920288, "kl": 0.005898475646972656, "learning_rate": 5.333333333333334e-06, "loss": -0.0068, "num_tokens": 5160971.0, "reward": 0.857421875, "reward_std": 0.16223029792308807, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.591796875, "rewards/tag_count_reward/std": 0.3643448054790497, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3572.0, "completions/mean_length": 2783.8125, "completions/mean_terminated_length": 1932.0, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 0.004779482364990291, "frac_reward_zero_std": 0.53125, "grad_norm": 0.35544490814208984, "kl": 0.004876136779785156, "learning_rate": 5.777777777777778e-06, "loss": 0.0281, "num_tokens": 5537895.0, "reward": 0.759765625, "reward_std": 0.21976111829280853, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.603515625, "rewards/tag_count_reward/std": 0.3528948724269867, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3494.0, "completions/mean_length": 2695.9609375, "completions/mean_terminated_length": 1861.742431640625, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.005120873962489598, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6192285418510437, "kl": 0.0075321197509765625, "learning_rate": 6.222222222222223e-06, "loss": 0.0295, "num_tokens": 5903546.0, "reward": 0.916015625, "reward_std": 0.32727113366127014, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.650390625, "rewards/tag_count_reward/std": 0.3526332974433899, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 2863.359375, "completions/mean_terminated_length": 1936.821533203125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.005462265559988905, "frac_reward_zero_std": 0.5625, "grad_norm": 0.42399972677230835, "kl": 0.009187698364257812, "learning_rate": 6.666666666666667e-06, "loss": 0.0007, "num_tokens": 6290128.0, "reward": 0.84765625, "reward_std": 0.350067675113678, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59765625, "rewards/tag_count_reward/std": 0.3648351728916168, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3436.0, "completions/mean_length": 2777.8515625, "completions/mean_terminated_length": 1804.913818359375, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.005803657157488211, "frac_reward_zero_std": 0.375, "grad_norm": 0.44070422649383545, "kl": 0.0106964111328125, "learning_rate": 7.111111111111112e-06, "loss": 0.0243, "num_tokens": 6665905.0, "reward": 0.892578125, "reward_std": 0.44644856452941895, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.611328125, "rewards/tag_count_reward/std": 0.3504897952079773, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3416.0, "completions/mean_length": 3126.234375, "completions/mean_terminated_length": 2000.37841796875, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 0.006145048754987518, "frac_reward_zero_std": 0.5, "grad_norm": 0.3213607668876648, "kl": 0.009923934936523438, "learning_rate": 7.555555555555556e-06, "loss": 0.0311, "num_tokens": 7088083.0, "reward": 0.7578125, "reward_std": 0.31506818532943726, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4765625, "rewards/tag_count_reward/std": 0.3032590448856354, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.1875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 3145.140625, "completions/mean_terminated_length": 2105.73681640625, "completions/min_length": 968.0, "completions/min_terminated_length": 968.0, "epoch": 0.006486440352486824, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3303903639316559, "kl": 0.013744354248046875, "learning_rate": 8.000000000000001e-06, "loss": 0.0219, "num_tokens": 7511277.0, "reward": 0.71484375, "reward_std": 0.26128900051116943, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.48046875, "rewards/tag_count_reward/std": 0.3099551796913147, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3528.0, "completions/mean_length": 2863.7109375, "completions/mean_terminated_length": 1876.648193359375, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 0.006827831949986131, "frac_reward_zero_std": 0.40625, "grad_norm": 0.9578901529312134, "kl": 0.02547454833984375, "learning_rate": 8.444444444444446e-06, "loss": 0.0353, "num_tokens": 7898416.0, "reward": 0.720703125, "reward_std": 0.20296892523765564, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.501953125, "rewards/tag_count_reward/std": 0.27618053555488586, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3303.0, "completions/mean_length": 2861.96875, "completions/mean_terminated_length": 1840.2264404296875, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.007169223547485438, "frac_reward_zero_std": 0.5, "grad_norm": 0.367509126663208, "kl": 0.021251678466796875, "learning_rate": 8.888888888888888e-06, "loss": 0.0205, "num_tokens": 8284368.0, "reward": 0.771484375, "reward_std": 0.335555762052536, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.521484375, "rewards/tag_count_reward/std": 0.29601576924324036, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3488.0, "completions/mean_length": 2990.0234375, "completions/mean_terminated_length": 2063.419921875, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 0.007510615144984744, "frac_reward_zero_std": 0.5, "grad_norm": 0.7796738743782043, "kl": 0.02317047119140625, "learning_rate": 9.333333333333334e-06, "loss": 0.018, "num_tokens": 8689099.0, "reward": 0.892578125, "reward_std": 0.3497520387172699, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.517578125, "rewards/tag_count_reward/std": 0.3155790865421295, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 3114.4140625, "completions/mean_terminated_length": 2357.326416015625, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 0.00785200674248405, "frac_reward_zero_std": 0.4375, "grad_norm": 1.2586368322372437, "kl": 0.02622222900390625, "learning_rate": 9.777777777777779e-06, "loss": 0.0028, "num_tokens": 9106936.0, "reward": 0.873046875, "reward_std": 0.3654538691043854, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.529296875, "rewards/tag_count_reward/std": 0.3115573823451996, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3527.0, "completions/mean_length": 2816.78125, "completions/mean_terminated_length": 1449.1304931640625, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.008193398339983357, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3420628309249878, "kl": 0.03411102294921875, "learning_rate": 1.0222222222222223e-05, "loss": -0.0144, "num_tokens": 9488668.0, "reward": 0.623046875, "reward_std": 0.07987198978662491, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.498046875, "rewards/tag_count_reward/std": 0.32527410984039307, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 2850.078125, "completions/mean_terminated_length": 2138.738525390625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.008534789937482664, "frac_reward_zero_std": 0.28125, "grad_norm": 0.4959162175655365, "kl": 0.0364227294921875, "learning_rate": 1.0666666666666667e-05, "loss": -0.0051, "num_tokens": 9874034.0, "reward": 0.89453125, "reward_std": 0.29277488589286804, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62890625, "rewards/tag_count_reward/std": 0.3260120153427124, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 2570.6171875, "completions/mean_terminated_length": 1807.109619140625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.00887618153498197, "frac_reward_zero_std": 0.53125, "grad_norm": 0.35553115606307983, "kl": 0.044036865234375, "learning_rate": 1.1111111111111113e-05, "loss": 0.0049, "num_tokens": 10223985.0, "reward": 1.17578125, "reward_std": 0.3890746235847473, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4347693622112274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.34721100330352783, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 2447.1796875, "completions/mean_terminated_length": 1669.355224609375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.009217573132481277, "frac_reward_zero_std": 0.5, "grad_norm": 0.3697810173034668, "kl": 0.05316162109375, "learning_rate": 1.1555555555555556e-05, "loss": 0.0228, "num_tokens": 10555824.0, "reward": 0.876953125, "reward_std": 0.2979937791824341, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.720703125, "rewards/tag_count_reward/std": 0.34024813771247864, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3553.0, "completions/mean_length": 2933.3515625, "completions/mean_terminated_length": 1951.0001220703125, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 0.009558964729980583, "frac_reward_zero_std": 0.59375, "grad_norm": 0.29490432143211365, "kl": 0.0458831787109375, "learning_rate": 1.2e-05, "loss": 0.0232, "num_tokens": 10954845.0, "reward": 0.806640625, "reward_std": 0.34365934133529663, "rewards/accuracy_reward/mean": 0.12096773833036423, "rewards/accuracy_reward/std": 0.3274126350879669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.572265625, "rewards/tag_count_reward/std": 0.35653674602508545, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2937.9453125, "completions/mean_terminated_length": 2291.890625, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 0.00990035632747989, "frac_reward_zero_std": 0.4375, "grad_norm": 0.40248623490333557, "kl": 0.0478973388671875, "learning_rate": 1.2444444444444446e-05, "loss": 0.0225, "num_tokens": 11350998.0, "reward": 0.908203125, "reward_std": 0.3623540699481964, "rewards/accuracy_reward/mean": 0.12903225421905518, "rewards/accuracy_reward/std": 0.3365956246852875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.658203125, "rewards/tag_count_reward/std": 0.36163330078125, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2790.6171875, "completions/mean_terminated_length": 1737.581787109375, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 0.010241747924979196, "frac_reward_zero_std": 0.625, "grad_norm": 0.5914451479911804, "kl": 0.0553131103515625, "learning_rate": 1.288888888888889e-05, "loss": 0.0108, "num_tokens": 11729469.0, "reward": 0.974609375, "reward_std": 0.1817583441734314, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.615234375, "rewards/tag_count_reward/std": 0.36167582869529724, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 2753.296875, "completions/mean_terminated_length": 1922.59375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.010583139522478503, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2599541246891022, "kl": 0.0503692626953125, "learning_rate": 1.3333333333333333e-05, "loss": 0.0188, "num_tokens": 12102867.0, "reward": 1.2109375, "reward_std": 0.14270445704460144, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.45510825514793396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.3737676441669464, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 2955.53125, "completions/mean_terminated_length": 2037.0001220703125, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 0.01092453111997781, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3193811774253845, "kl": 0.0529937744140625, "learning_rate": 1.377777777777778e-05, "loss": 0.0314, "num_tokens": 12501811.0, "reward": 0.8359375, "reward_std": 0.2647542655467987, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5703125, "rewards/tag_count_reward/std": 0.36712533235549927, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2929.875, "completions/mean_terminated_length": 2004.2264404296875, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.011265922717477115, "frac_reward_zero_std": 0.71875, "grad_norm": 0.31857308745384216, "kl": 0.0516357421875, "learning_rate": 1.4222222222222224e-05, "loss": 0.0151, "num_tokens": 12899299.0, "reward": 0.705078125, "reward_std": 0.13216298818588257, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.580078125, "rewards/tag_count_reward/std": 0.36712008714675903, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3582.0, "completions/mean_length": 2961.3828125, "completions/mean_terminated_length": 2185.842041015625, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.011607314314976422, "frac_reward_zero_std": 0.46875, "grad_norm": 0.38899877667427063, "kl": 0.0594482421875, "learning_rate": 1.4666666666666666e-05, "loss": 0.0361, "num_tokens": 13297952.0, "reward": 0.87109375, "reward_std": 0.35478463768959045, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.3680248558521271, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3582.0, "completions/mean_length": 2635.1640625, "completions/mean_terminated_length": 1686.328125, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.011948705912475729, "frac_reward_zero_std": 0.65625, "grad_norm": 0.37273454666137695, "kl": 0.084136962890625, "learning_rate": 1.5111111111111112e-05, "loss": 0.0162, "num_tokens": 13657037.0, "reward": 1.0625, "reward_std": 0.26114049553871155, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.36821284890174866, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 2814.6796875, "completions/mean_terminated_length": 1690.28857421875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.012290097509975036, "frac_reward_zero_std": 0.71875, "grad_norm": 0.26906415820121765, "kl": 0.07012939453125, "learning_rate": 1.555555555555556e-05, "loss": 0.0079, "num_tokens": 14039520.0, "reward": 0.7734375, "reward_std": 0.19364014267921448, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5546875, "rewards/tag_count_reward/std": 0.36578238010406494, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3505.0, "completions/mean_length": 2982.84375, "completions/mean_terminated_length": 2013.632568359375, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.012631489107474343, "frac_reward_zero_std": 0.5625, "grad_norm": 0.34450799226760864, "kl": 0.075714111328125, "learning_rate": 1.6000000000000003e-05, "loss": 0.0227, "num_tokens": 14441340.0, "reward": 0.822265625, "reward_std": 0.33261731266975403, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.525390625, "rewards/tag_count_reward/std": 0.35610511898994446, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3580.0, "completions/mean_length": 2755.4375, "completions/mean_terminated_length": 1952.3692626953125, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.012972880704973648, "frac_reward_zero_std": 0.53125, "grad_norm": 0.33996719121932983, "kl": 0.0821533203125, "learning_rate": 1.6444444444444444e-05, "loss": 0.0193, "num_tokens": 14815496.0, "reward": 0.83984375, "reward_std": 0.3050321936607361, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.36718815565109253, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 2733.3984375, "completions/mean_terminated_length": 1958.9700927734375, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.013314272302472955, "frac_reward_zero_std": 0.5, "grad_norm": 0.7348785400390625, "kl": 0.084259033203125, "learning_rate": 1.688888888888889e-05, "loss": 0.027, "num_tokens": 15186127.0, "reward": 1.033203125, "reward_std": 0.49291110038757324, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.658203125, "rewards/tag_count_reward/std": 0.36970818042755127, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3419.0, "completions/mean_length": 2911.3125, "completions/mean_terminated_length": 2046.4287109375, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "epoch": 0.013655663899972262, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3535268008708954, "kl": 0.0902099609375, "learning_rate": 1.7333333333333336e-05, "loss": 0.0039, "num_tokens": 15580203.0, "reward": 0.75390625, "reward_std": 0.26393139362335205, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59765625, "rewards/tag_count_reward/std": 0.3688596487045288, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 2772.5, "completions/mean_terminated_length": 2100.1142578125, "completions/min_length": 1016.0, "completions/min_terminated_length": 1016.0, "epoch": 0.013997055497471569, "frac_reward_zero_std": 0.5, "grad_norm": 0.3534427583217621, "kl": 0.099151611328125, "learning_rate": 1.7777777777777777e-05, "loss": 0.0404, "num_tokens": 15954079.0, "reward": 1.04296875, "reward_std": 0.3417321443557739, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.3713528513908386, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3429.0, "completions/mean_length": 3021.7421875, "completions/mean_terminated_length": 1784.7750244140625, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "epoch": 0.014338447094970876, "frac_reward_zero_std": 0.71875, "grad_norm": 0.25263962149620056, "kl": 0.093353271484375, "learning_rate": 1.8222222222222224e-05, "loss": 0.0082, "num_tokens": 16363482.0, "reward": 0.703125, "reward_std": 0.22749368846416473, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.3493525981903076, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3515.0, "completions/mean_length": 3011.484375, "completions/mean_terminated_length": 2147.09814453125, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.01467983869247018, "frac_reward_zero_std": 0.5, "grad_norm": 0.28289416432380676, "kl": 0.10638427734375, "learning_rate": 1.866666666666667e-05, "loss": 0.0282, "num_tokens": 16768632.0, "reward": 0.853515625, "reward_std": 0.3670836091041565, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.572265625, "rewards/tag_count_reward/std": 0.3647245168685913, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3429.0, "completions/mean_length": 2918.8203125, "completions/mean_terminated_length": 2140.898193359375, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "epoch": 0.015021230289969488, "frac_reward_zero_std": 0.5, "grad_norm": 0.299098938703537, "kl": 0.113739013671875, "learning_rate": 1.9111111111111113e-05, "loss": 0.025, "num_tokens": 17161073.0, "reward": 1.177734375, "reward_std": 0.4010846018791199, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.447474867105484, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.630859375, "rewards/tag_count_reward/std": 0.3644714057445526, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 2866.9921875, "completions/mean_terminated_length": 2103.725830078125, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.015362621887468795, "frac_reward_zero_std": 0.75, "grad_norm": 0.49819648265838623, "kl": 0.1131591796875, "learning_rate": 1.9555555555555557e-05, "loss": 0.0181, "num_tokens": 17549768.0, "reward": 0.8671875, "reward_std": 0.23004087805747986, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.3697965741157532, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3488.0, "completions/mean_length": 3063.171875, "completions/mean_terminated_length": 2276.82373046875, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.0157040134849681, "frac_reward_zero_std": 0.53125, "grad_norm": 0.29166948795318604, "kl": 0.110687255859375, "learning_rate": 2e-05, "loss": 0.031, "num_tokens": 17961746.0, "reward": 0.912109375, "reward_std": 0.3591154217720032, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.583984375, "rewards/tag_count_reward/std": 0.3621857166290283, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3470.0, "completions/mean_length": 3006.0390625, "completions/mean_terminated_length": 2188.169921875, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 0.016045405082467407, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2663406729698181, "kl": 0.114898681640625, "learning_rate": 1.999969914479768e-05, "loss": 0.0254, "num_tokens": 18366755.0, "reward": 0.818359375, "reward_std": 0.23288872838020325, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.599609375, "rewards/tag_count_reward/std": 0.36227062344551086, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3474.0, "completions/mean_length": 3246.046875, "completions/mean_terminated_length": 2414.864990234375, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 0.016386796679966713, "frac_reward_zero_std": 0.5625, "grad_norm": 0.26411378383636475, "kl": 0.120269775390625, "learning_rate": 1.9998796597293477e-05, "loss": 0.0402, "num_tokens": 18803705.0, "reward": 0.720703125, "reward_std": 0.17609117925167084, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.533203125, "rewards/tag_count_reward/std": 0.3413311839103699, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 3020.3046875, "completions/mean_terminated_length": 2080.8125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.01672818827746602, "frac_reward_zero_std": 0.53125, "grad_norm": 0.2807792127132416, "kl": 0.127410888671875, "learning_rate": 1.999729241179462e-05, "loss": 0.0186, "num_tokens": 19209932.0, "reward": 0.931640625, "reward_std": 0.2661032974720001, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.587890625, "rewards/tag_count_reward/std": 0.3529820144176483, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3544.0, "completions/mean_length": 2931.1875, "completions/mean_terminated_length": 1878.69384765625, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.017069579874965327, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2702789902687073, "kl": 0.152557373046875, "learning_rate": 1.9995186678809513e-05, "loss": 0.0181, "num_tokens": 19603420.0, "reward": 1.0078125, "reward_std": 0.3685213327407837, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5703125, "rewards/tag_count_reward/std": 0.358992338180542, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3493.0, "completions/mean_length": 2966.4921875, "completions/mean_terminated_length": 2197.31591796875, "completions/min_length": 1065.0, "completions/min_terminated_length": 1065.0, "epoch": 0.017410971472464634, "frac_reward_zero_std": 0.5, "grad_norm": 0.25847598910331726, "kl": 0.15814208984375, "learning_rate": 1.9992479525042305e-05, "loss": 0.0225, "num_tokens": 20004219.0, "reward": 1.134765625, "reward_std": 0.3639649748802185, "rewards/accuracy_reward/mean": 0.2578125, "rewards/accuracy_reward/std": 0.43914902210235596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.619140625, "rewards/tag_count_reward/std": 0.36176085472106934, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2834.7109375, "completions/mean_terminated_length": 2037.08056640625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.01775236306996394, "frac_reward_zero_std": 0.65625, "grad_norm": 0.2311725914478302, "kl": 0.164306640625, "learning_rate": 1.998917111338525e-05, "loss": 0.0101, "num_tokens": 20387922.0, "reward": 0.982421875, "reward_std": 0.22784006595611572, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.638671875, "rewards/tag_count_reward/std": 0.362906813621521, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3558.0, "completions/mean_length": 2797.875, "completions/mean_terminated_length": 2186.444580078125, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 0.018093754667463248, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3029285967350006, "kl": 0.18310546875, "learning_rate": 1.9985261642908917e-05, "loss": 0.0271, "num_tokens": 20765870.0, "reward": 1.2890625, "reward_std": 0.3925495445728302, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.45510825514793396, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7109375, "rewards/tag_count_reward/std": 0.35276955366134644, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3544.0, "completions/mean_length": 2678.46875, "completions/mean_terminated_length": 2038.56005859375, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.018435146264962555, "frac_reward_zero_std": 0.625, "grad_norm": 0.2561197280883789, "kl": 0.2044677734375, "learning_rate": 1.998075134885022e-05, "loss": 0.0224, "num_tokens": 21130746.0, "reward": 1.078125, "reward_std": 0.2616530954837799, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.35215869545936584, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 2748.2265625, "completions/mean_terminated_length": 2263.271728515625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "epoch": 0.018776537862461862, "frac_reward_zero_std": 0.46875, "grad_norm": 0.30205026268959045, "kl": 0.18109130859375, "learning_rate": 1.9975640502598243e-05, "loss": 0.0089, "num_tokens": 21502015.0, "reward": 1.1796875, "reward_std": 0.47462642192840576, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7578125, "rewards/tag_count_reward/std": 0.33340510725975037, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 2414.0859375, "completions/mean_terminated_length": 1639.207763671875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "epoch": 0.019117929459961165, "frac_reward_zero_std": 0.53125, "grad_norm": 0.32271432876586914, "kl": 0.21636962890625, "learning_rate": 1.996992941167792e-05, "loss": 0.0201, "num_tokens": 21831598.0, "reward": 1.09375, "reward_std": 0.30657491087913513, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.71875, "rewards/tag_count_reward/std": 0.35215869545936584, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3400.0, "completions/mean_length": 2939.4375, "completions/mean_terminated_length": 2274.412841796875, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "epoch": 0.019459321057460472, "frac_reward_zero_std": 0.34375, "grad_norm": 0.3238721489906311, "kl": 0.19854736328125, "learning_rate": 1.996361841973154e-05, "loss": 0.039, "num_tokens": 22228606.0, "reward": 0.87890625, "reward_std": 0.37155014276504517, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66015625, "rewards/tag_count_reward/std": 0.3587137758731842, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3574.0, "completions/mean_length": 2887.703125, "completions/mean_terminated_length": 2253.76123046875, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.01980071265495978, "frac_reward_zero_std": 0.5, "grad_norm": 0.2821209728717804, "kl": 0.179931640625, "learning_rate": 1.9956707906498046e-05, "loss": 0.0233, "num_tokens": 22622596.0, "reward": 1.060546875, "reward_std": 0.35672104358673096, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.669921875, "rewards/tag_count_reward/std": 0.36442920565605164, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2893.703125, "completions/mean_terminated_length": 1977.4908447265625, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "epoch": 0.020142104252459086, "frac_reward_zero_std": 0.5625, "grad_norm": 0.24977393448352814, "kl": 0.1729736328125, "learning_rate": 1.9949198287790215e-05, "loss": 0.0473, "num_tokens": 23011378.0, "reward": 1.05078125, "reward_std": 0.36624208092689514, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59765625, "rewards/tag_count_reward/std": 0.3648351728916168, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 2584.3828125, "completions/mean_terminated_length": 1729.6376953125, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 0.020483495849958393, "frac_reward_zero_std": 0.5625, "grad_norm": 0.27603334188461304, "kl": 0.199462890625, "learning_rate": 1.9941090015469614e-05, "loss": 0.0288, "num_tokens": 23364315.0, "reward": 0.9609375, "reward_std": 0.2518423795700073, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.36510905623435974, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3574.0, "completions/mean_length": 2836.578125, "completions/mean_terminated_length": 2089.15625, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 0.0208248874474577, "frac_reward_zero_std": 0.53125, "grad_norm": 0.308930903673172, "kl": 0.1788330078125, "learning_rate": 1.9932383577419432e-05, "loss": 0.0173, "num_tokens": 23747209.0, "reward": 1.037109375, "reward_std": 0.36217206716537476, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.662109375, "rewards/tag_count_reward/std": 0.35988548398017883, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3520.0, "completions/mean_length": 2551.140625, "completions/mean_terminated_length": 1931.425048828125, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.021166279044957007, "frac_reward_zero_std": 0.5, "grad_norm": 0.3429572284221649, "kl": 0.196044921875, "learning_rate": 1.99230794975151e-05, "loss": 0.0247, "num_tokens": 24093515.0, "reward": 1.005859375, "reward_std": 0.29658064246177673, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.724609375, "rewards/tag_count_reward/std": 0.3588584065437317, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 2789.7265625, "completions/mean_terminated_length": 1944.2095947265625, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 0.021507670642456313, "frac_reward_zero_std": 0.5625, "grad_norm": 0.25445467233657837, "kl": 0.18499755859375, "learning_rate": 1.9913178335592784e-05, "loss": 0.0279, "num_tokens": 24472644.0, "reward": 0.9296875, "reward_std": 0.31241774559020996, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.36578238010406494, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3574.0, "completions/mean_length": 2430.5625, "completions/mean_terminated_length": 1867.255859375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.02184906223995562, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3254539370536804, "kl": 0.19183349609375, "learning_rate": 1.9902680687415704e-05, "loss": 0.0226, "num_tokens": 24804768.0, "reward": 1.275390625, "reward_std": 0.4635465741157532, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.775390625, "rewards/tag_count_reward/std": 0.33473435044288635, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3526.0, "completions/mean_length": 2674.2890625, "completions/mean_terminated_length": 1735.698486328125, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.022190453837454927, "frac_reward_zero_std": 0.6875, "grad_norm": 0.24796949326992035, "kl": 0.1805419921875, "learning_rate": 1.9891587184638274e-05, "loss": 0.0179, "num_tokens": 25168389.0, "reward": 0.896484375, "reward_std": 0.2598833441734314, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.630859375, "rewards/tag_count_reward/std": 0.37248480319976807, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3570.0, "completions/mean_length": 2785.6796875, "completions/mean_terminated_length": 1987.359375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.02253184543495423, "frac_reward_zero_std": 0.625, "grad_norm": 0.26873698830604553, "kl": 0.1845703125, "learning_rate": 1.9879898494768093e-05, "loss": 0.0273, "num_tokens": 25545572.0, "reward": 0.86328125, "reward_std": 0.2120647430419922, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.64453125, "rewards/tag_count_reward/std": 0.3666852116584778, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 3077.3359375, "completions/mean_terminated_length": 2232.89599609375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.022873237032453538, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2997102737426758, "kl": 0.17413330078125, "learning_rate": 1.9867615321125796e-05, "loss": 0.0372, "num_tokens": 25957867.0, "reward": 0.8984375, "reward_std": 0.3808963894844055, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5546875, "rewards/tag_count_reward/std": 0.358992338180542, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 3137.640625, "completions/mean_terminated_length": 2255.30224609375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.023214628629952844, "frac_reward_zero_std": 0.5625, "grad_norm": 0.27699077129364014, "kl": 0.17657470703125, "learning_rate": 1.9854738402802715e-05, "loss": 0.0199, "num_tokens": 26380089.0, "reward": 0.84765625, "reward_std": 0.39581140875816345, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.51953125, "rewards/tag_count_reward/std": 0.35300925374031067, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 3049.328125, "completions/mean_terminated_length": 2187.30615234375, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 0.02355602022745215, "frac_reward_zero_std": 0.65625, "grad_norm": 0.22677746415138245, "kl": 0.17987060546875, "learning_rate": 1.9841268514616434e-05, "loss": 0.0268, "num_tokens": 26792787.0, "reward": 0.599609375, "reward_std": 0.1674078404903412, "rewards/accuracy_reward/mean": 0.032258063554763794, "rewards/accuracy_reward/std": 0.17740146815776825, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.537109375, "rewards/tag_count_reward/std": 0.36193084716796875, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3571.0, "completions/mean_length": 2859.203125, "completions/mean_terminated_length": 2111.39697265625, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 0.023897411824951458, "frac_reward_zero_std": 0.46875, "grad_norm": 0.6092619895935059, "kl": 0.1851806640625, "learning_rate": 1.9827206467064133e-05, "loss": 0.0383, "num_tokens": 27179705.0, "reward": 0.953125, "reward_std": 0.38114699721336365, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.625, "rewards/tag_count_reward/std": 0.3725312352180481, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3447.0, "completions/mean_length": 2699.9296875, "completions/mean_terminated_length": 1758.822509765625, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.024238803422450765, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2147958129644394, "kl": 0.18780517578125, "learning_rate": 1.9812553106273848e-05, "loss": 0.0268, "num_tokens": 27545608.0, "reward": 0.755859375, "reward_std": 0.1977209448814392, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.615234375, "rewards/tag_count_reward/std": 0.37107837200164795, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3383.0, "completions/mean_length": 2849.2421875, "completions/mean_terminated_length": 1989.9490966796875, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.024580195019950072, "frac_reward_zero_std": 0.6875, "grad_norm": 0.24814502894878387, "kl": 0.19287109375, "learning_rate": 1.979730931395354e-05, "loss": 0.0121, "num_tokens": 27933123.0, "reward": 0.763671875, "reward_std": 0.2560723125934601, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.607421875, "rewards/tag_count_reward/std": 0.37211301922798157, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3512.0, "completions/mean_length": 2743.640625, "completions/mean_terminated_length": 1628.2545166015625, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "epoch": 0.02492158661744938, "frac_reward_zero_std": 0.625, "grad_norm": 0.2367912083864212, "kl": 0.19189453125, "learning_rate": 1.9781476007338058e-05, "loss": 0.0202, "num_tokens": 28307177.0, "reward": 0.728515625, "reward_std": 0.1887519806623459, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.587890625, "rewards/tag_count_reward/std": 0.3653143644332886, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 2701.0703125, "completions/mean_terminated_length": 1946.1014404296875, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.025262978214948686, "frac_reward_zero_std": 0.625, "grad_norm": 0.2576083838939667, "kl": 0.201171875, "learning_rate": 1.976505413913393e-05, "loss": 0.007, "num_tokens": 28672662.0, "reward": 1.130859375, "reward_std": 0.27597564458847046, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.662109375, "rewards/tag_count_reward/std": 0.37198901176452637, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3571.0, "completions/mean_length": 3048.6015625, "completions/mean_terminated_length": 2360.232177734375, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.025604369812447993, "frac_reward_zero_std": 0.46875, "grad_norm": 0.2625581920146942, "kl": 0.19500732421875, "learning_rate": 1.974804469746206e-05, "loss": 0.0363, "num_tokens": 29084163.0, "reward": 0.9453125, "reward_std": 0.4019372761249542, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.3677949607372284, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 2815.8515625, "completions/mean_terminated_length": 1763.2037353515625, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 0.025945761409947296, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23508375883102417, "kl": 0.2017822265625, "learning_rate": 1.973044870579824e-05, "loss": 0.0335, "num_tokens": 29463292.0, "reward": 0.953125, "reward_std": 0.2533228099346161, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.578125, "rewards/tag_count_reward/std": 0.36954694986343384, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3580.0, "completions/mean_length": 3096.671875, "completions/mean_terminated_length": 2133.348876953125, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 0.026287153007446603, "frac_reward_zero_std": 0.65625, "grad_norm": 0.21940624713897705, "kl": 0.1966552734375, "learning_rate": 1.9712267222911605e-05, "loss": 0.0132, "num_tokens": 29880322.0, "reward": 0.701171875, "reward_std": 0.1592055857181549, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.513671875, "rewards/tag_count_reward/std": 0.3553701937198639, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3368.0, "completions/mean_length": 2755.234375, "completions/mean_terminated_length": 1844.95068359375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.02662854460494591, "frac_reward_zero_std": 0.65625, "grad_norm": 17.97633934020996, "kl": 0.3902587890625, "learning_rate": 1.9693501342800895e-05, "loss": 0.0455, "num_tokens": 30254560.0, "reward": 0.849609375, "reward_std": 0.21911722421646118, "rewards/accuracy_reward/mean": 0.12096773833036423, "rewards/accuracy_reward/std": 0.3274126350879669, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.615234375, "rewards/tag_count_reward/std": 0.3737213611602783, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 3054.4609375, "completions/mean_terminated_length": 2171.89599609375, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "epoch": 0.026969936202445217, "frac_reward_zero_std": 0.625, "grad_norm": 0.27303701639175415, "kl": 0.18377685546875, "learning_rate": 1.967415219462864e-05, "loss": 0.0297, "num_tokens": 30666967.0, "reward": 0.6953125, "reward_std": 0.2459690272808075, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5390625, "rewards/tag_count_reward/std": 0.3637586832046509, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3527.0, "completions/mean_length": 2677.09375, "completions/mean_terminated_length": 1901.6231689453125, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.027311327799944524, "frac_reward_zero_std": 0.625, "grad_norm": 3.9623684883117676, "kl": 0.2177734375, "learning_rate": 1.9654220942653223e-05, "loss": 0.0397, "num_tokens": 31031959.0, "reward": 0.810546875, "reward_std": 0.2846389710903168, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.669921875, "rewards/tag_count_reward/std": 0.3684581518173218, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2527.203125, "completions/mean_terminated_length": 1651.5714111328125, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.02765271939744383, "frac_reward_zero_std": 0.53125, "grad_norm": 0.2825882136821747, "kl": 0.18914794921875, "learning_rate": 1.9633708786158803e-05, "loss": 0.0318, "num_tokens": 31374681.0, "reward": 1.0390625, "reward_std": 0.2190738320350647, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6796875, "rewards/tag_count_reward/std": 0.36578238010406494, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3373.0, "completions/mean_length": 3017.0234375, "completions/mean_terminated_length": 2288.0537109375, "completions/min_length": 1277.0, "completions/min_terminated_length": 1277.0, "epoch": 0.027994110994943137, "frac_reward_zero_std": 0.46875, "grad_norm": 0.27044257521629333, "kl": 0.1844482421875, "learning_rate": 1.961261695938319e-05, "loss": 0.0373, "num_tokens": 31781992.0, "reward": 1.1015625, "reward_std": 0.42221954464912415, "rewards/accuracy_reward/mean": 0.2578125, "rewards/accuracy_reward/std": 0.43914902210235596, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5859375, "rewards/tag_count_reward/std": 0.371787428855896, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 2989.59375, "completions/mean_terminated_length": 2356.838623046875, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 0.028335502592442444, "frac_reward_zero_std": 0.53125, "grad_norm": 0.32531461119651794, "kl": 0.20086669921875, "learning_rate": 1.959094673144354e-05, "loss": 0.0197, "num_tokens": 32184476.0, "reward": 0.943359375, "reward_std": 0.2779914438724518, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.615234375, "rewards/tag_count_reward/std": 0.3737213611602783, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3405.0, "completions/mean_length": 2775.34375, "completions/mean_terminated_length": 1991.5692138671875, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.02867689418994175, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2808055877685547, "kl": 0.1846923828125, "learning_rate": 1.9568699406260016e-05, "loss": 0.0396, "num_tokens": 32559660.0, "reward": 0.892578125, "reward_std": 0.29128602147102356, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.642578125, "rewards/tag_count_reward/std": 0.37211301922798157, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 2779.953125, "completions/mean_terminated_length": 1924.0322265625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.029018285787441058, "frac_reward_zero_std": 0.59375, "grad_norm": 0.26734375953674316, "kl": 0.19775390625, "learning_rate": 1.954587632247732e-05, "loss": 0.028, "num_tokens": 32935734.0, "reward": 0.90625, "reward_std": 0.2388405054807663, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.625, "rewards/tag_count_reward/std": 0.3725312352180481, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3482.0, "completions/mean_length": 2945.0, "completions/mean_terminated_length": 2040.7547607421875, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 0.02935967738494036, "frac_reward_zero_std": 0.53125, "grad_norm": 0.27204370498657227, "kl": 0.194091796875, "learning_rate": 1.9522478853384154e-05, "loss": 0.0171, "num_tokens": 33335282.0, "reward": 0.875, "reward_std": 0.35471218824386597, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.578125, "rewards/tag_count_reward/std": 0.36146846413612366, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2843.0390625, "completions/mean_terminated_length": 2078.5556640625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.02970106898243967, "frac_reward_zero_std": 0.75, "grad_norm": 420.11358642578125, "kl": 2.243408203125, "learning_rate": 1.9498508406830577e-05, "loss": 0.0992, "num_tokens": 33720123.0, "reward": 0.791015625, "reward_std": 0.20558558404445648, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.619140625, "rewards/tag_count_reward/std": 0.3764275014400482, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 3069.3359375, "completions/mean_terminated_length": 2266.4599609375, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.030042460579938975, "frac_reward_zero_std": 0.65625, "grad_norm": 0.29481202363967896, "kl": 0.26953125, "learning_rate": 1.9473966425143292e-05, "loss": 0.0337, "num_tokens": 34136014.0, "reward": 0.90625, "reward_std": 0.26554781198501587, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.3645188808441162, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3477.0, "completions/mean_length": 2896.96875, "completions/mean_terminated_length": 2379.342529296875, "completions/min_length": 1134.0, "completions/min_terminated_length": 1134.0, "epoch": 0.030383852177438282, "frac_reward_zero_std": 0.59375, "grad_norm": 0.22812406718730927, "kl": 0.20965576171875, "learning_rate": 1.944885438503888e-05, "loss": 0.0353, "num_tokens": 34526518.0, "reward": 0.96484375, "reward_std": 0.28904032707214355, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.68359375, "rewards/tag_count_reward/std": 0.3678576648235321, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3503.0, "completions/mean_length": 2948.734375, "completions/mean_terminated_length": 2157.438720703125, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 0.03072524377493759, "frac_reward_zero_std": 0.5, "grad_norm": 0.2590017318725586, "kl": 0.22015380859375, "learning_rate": 1.9423173797534924e-05, "loss": 0.0474, "num_tokens": 34924000.0, "reward": 0.861328125, "reward_std": 0.3714698553085327, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.611328125, "rewards/tag_count_reward/std": 0.366952508687973, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3523.0, "completions/mean_length": 3006.46875, "completions/mean_terminated_length": 2287.087646484375, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.031066635372436896, "frac_reward_zero_std": 0.375, "grad_norm": 0.32937249541282654, "kl": 0.214111328125, "learning_rate": 1.9396926207859085e-05, "loss": 0.0362, "num_tokens": 35328932.0, "reward": 0.806640625, "reward_std": 0.3674401044845581, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.587890625, "rewards/tag_count_reward/std": 0.37066370248794556, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 3139.0703125, "completions/mean_terminated_length": 2194.951171875, "completions/min_length": 1222.0, "completions/min_terminated_length": 1222.0, "epoch": 0.0314080269699362, "frac_reward_zero_std": 0.625, "grad_norm": 1.630729079246521, "kl": 0.22186279296875, "learning_rate": 1.937011319535615e-05, "loss": 0.0456, "num_tokens": 35753129.0, "reward": 0.666015625, "reward_std": 0.29503291845321655, "rewards/accuracy_reward/mean": 0.09166666865348816, "rewards/accuracy_reward/std": 0.2897646725177765, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.494140625, "rewards/tag_count_reward/std": 0.35141006112098694, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.28125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3497.0, "completions/mean_length": 2690.9375, "completions/mean_terminated_length": 2018.0821533203125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.03174941856743551, "frac_reward_zero_std": 0.59375, "grad_norm": 0.28304898738861084, "kl": 0.23199462890625, "learning_rate": 1.9342736373392976e-05, "loss": 0.0294, "num_tokens": 36116133.0, "reward": 0.97265625, "reward_std": 0.2264118194580078, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.36634954810142517, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3543.0, "completions/mean_length": 2767.40625, "completions/mean_terminated_length": 1975.9384765625, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.03209081016493481, "frac_reward_zero_std": 0.5625, "grad_norm": 0.27265122532844543, "kl": 0.23809814453125, "learning_rate": 1.9314797389261426e-05, "loss": 0.0323, "num_tokens": 36491057.0, "reward": 0.826171875, "reward_std": 0.340867280960083, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.638671875, "rewards/tag_count_reward/std": 0.3735978901386261, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 2761.1484375, "completions/mean_terminated_length": 1798.83056640625, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.032432201762434124, "frac_reward_zero_std": 0.59375, "grad_norm": 0.24852459132671356, "kl": 0.23797607421875, "learning_rate": 1.9286297924079244e-05, "loss": 0.0024, "num_tokens": 36866160.0, "reward": 0.87890625, "reward_std": 0.2873074412345886, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59765625, "rewards/tag_count_reward/std": 0.3728407323360443, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2743.1484375, "completions/mean_terminated_length": 2068.0986328125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.03277359335993343, "frac_reward_zero_std": 0.59375, "grad_norm": 0.2417757362127304, "kl": 0.2374267578125, "learning_rate": 1.9257239692688907e-05, "loss": 0.0346, "num_tokens": 37236019.0, "reward": 1.001953125, "reward_std": 0.3302620053291321, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.673828125, "rewards/tag_count_reward/std": 0.3706222176551819, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3550.0, "completions/mean_length": 2526.59375, "completions/mean_terminated_length": 1803.105224609375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.03311498495743274, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3100202977657318, "kl": 0.239990234375, "learning_rate": 1.9227624443554425e-05, "loss": 0.0359, "num_tokens": 37579715.0, "reward": 0.98828125, "reward_std": 0.4097243547439575, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.69140625, "rewards/tag_count_reward/std": 0.36651739478111267, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 2857.484375, "completions/mean_terminated_length": 1952.5263671875, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.03345637655493204, "frac_reward_zero_std": 0.53125, "grad_norm": 0.27218031883239746, "kl": 0.2366943359375, "learning_rate": 1.9197453958656157e-05, "loss": 0.0406, "num_tokens": 37966977.0, "reward": 0.814453125, "reward_std": 0.3187915086746216, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.595703125, "rewards/tag_count_reward/std": 0.3713683784008026, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3570.0, "completions/mean_length": 2589.2734375, "completions/mean_terminated_length": 2031.2559814453125, "completions/min_length": 667.0, "completions/min_terminated_length": 667.0, "epoch": 0.03379776815243135, "frac_reward_zero_std": 0.59375, "grad_norm": 0.27328935265541077, "kl": 0.23333740234375, "learning_rate": 1.916673005338357e-05, "loss": 0.0224, "num_tokens": 38318904.0, "reward": 0.990234375, "reward_std": 0.21185174584388733, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.740234375, "rewards/tag_count_reward/std": 0.3527204990386963, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3571.0, "completions/mean_length": 2528.6953125, "completions/mean_terminated_length": 1782.9466552734375, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 0.034139159749930655, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3132285177707672, "kl": 0.2442626953125, "learning_rate": 1.913545457642601e-05, "loss": 0.0381, "num_tokens": 38663141.0, "reward": 0.943359375, "reward_std": 0.38466405868530273, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.693359375, "rewards/tag_count_reward/std": 0.35662299394607544, "step": 100 } ], "logging_steps": 1, "max_steps": 450, "num_input_tokens_seen": 38663141, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }