{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05714285714285714, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 1702.03125, "completions/mean_terminated_length": 993.6190795898438, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.001142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2544386684894562, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 118418.0, "reward": 0.17899775505065918, "reward_std": 0.7650213241577148, "rewards/cosine_scaled_reward/mean": -0.09800112992525101, "rewards/cosine_scaled_reward/std": 0.37953105568885803, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.48795005679130554, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 1738.90625, "completions/mean_terminated_length": 949.0, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.002285714285714286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2436082512140274, "learning_rate": 5e-08, "loss": -0.0, "num_tokens": 239748.0, "reward": 0.3848632574081421, "reward_std": 0.9111153483390808, "rewards/cosine_scaled_reward/mean": 0.020556632429361343, "rewards/cosine_scaled_reward/std": 0.4492928683757782, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 1989.015625, "completions/mean_terminated_length": 1104.25, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.0034285714285714284, "frac_reward_zero_std": 0.0, "grad_norm": 0.2544717788696289, "learning_rate": 1e-07, "loss": -0.0, "num_tokens": 377517.0, "reward": -0.3279358148574829, "reward_std": 0.33216947317123413, "rewards/cosine_scaled_reward/mean": -0.20303040742874146, "rewards/cosine_scaled_reward/std": 0.179075226187706, "rewards/format_reward/mean": 0.078125, "rewards/format_reward/std": 0.27048972249031067, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1566.421875, "completions/mean_terminated_length": 1084.84375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.004571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.28807103633880615, "learning_rate": 1.5e-07, "loss": -0.0, "num_tokens": 487576.0, "reward": 0.2716121971607208, "reward_std": 0.6643469333648682, "rewards/cosine_scaled_reward/mean": -0.12981891632080078, "rewards/cosine_scaled_reward/std": 0.3019586503505707, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5029674172401428, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 1936.84375, "completions/mean_terminated_length": 1031.71435546875, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.005714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.26783761382102966, "learning_rate": 2e-07, "loss": -0.0, "num_tokens": 622350.0, "reward": -0.3612896800041199, "reward_std": 0.41048353910446167, "rewards/cosine_scaled_reward/mean": -0.23533234000205994, "rewards/cosine_scaled_reward/std": 0.20467400550842285, "rewards/format_reward/mean": 0.109375, "rewards/format_reward/std": 0.3145764470100403, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 1889.453125, "completions/mean_terminated_length": 779.625, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "epoch": 0.006857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.262518972158432, "learning_rate": 2.5e-07, "loss": 0.0, "num_tokens": 754923.0, "reward": -0.29250282049179077, "reward_std": 0.5422531962394714, "rewards/cosine_scaled_reward/mean": -0.22437641024589539, "rewards/cosine_scaled_reward/std": 0.22509199380874634, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.36596253514289856, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 1921.921875, "completions/mean_terminated_length": 1314.45458984375, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.22601397335529327, "learning_rate": 3e-07, "loss": 0.0, "num_tokens": 888334.0, "reward": 0.025340259075164795, "reward_std": 0.7285393476486206, "rewards/cosine_scaled_reward/mean": -0.1279548704624176, "rewards/cosine_scaled_reward/std": 0.40222346782684326, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4531635046005249, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1736.859375, "completions/mean_terminated_length": 999.9473876953125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.009142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.24552854895591736, "learning_rate": 3.5e-07, "loss": 0.0, "num_tokens": 1009909.0, "reward": 0.21729671955108643, "reward_std": 0.6989120244979858, "rewards/cosine_scaled_reward/mean": -0.055414143949747086, "rewards/cosine_scaled_reward/std": 0.47493892908096313, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.4732423722743988, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1967.53125, "completions/mean_terminated_length": 1475.77783203125, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 0.010285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.2430322915315628, "learning_rate": 4e-07, "loss": 0.0, "num_tokens": 1147287.0, "reward": -0.21451422572135925, "reward_std": 0.587526798248291, "rewards/cosine_scaled_reward/mean": -0.19319462776184082, "rewards/cosine_scaled_reward/std": 0.29357606172561646, "rewards/format_reward/mean": 0.171875, "rewards/format_reward/std": 0.38025420904159546, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1708.546875, "completions/mean_terminated_length": 961.75, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.011428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.2543582320213318, "learning_rate": 4.5e-07, "loss": 0.0, "num_tokens": 1267466.0, "reward": 0.02539752423763275, "reward_std": 0.545810341835022, "rewards/cosine_scaled_reward/mean": -0.14355123043060303, "rewards/cosine_scaled_reward/std": 0.36147356033325195, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.467176616191864, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 1967.734375, "completions/mean_terminated_length": 1191.8333740234375, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.012571428571428572, "frac_reward_zero_std": 0.0, "grad_norm": 0.24583907425403595, "learning_rate": 5e-07, "loss": -0.0, "num_tokens": 1405073.0, "reward": -0.46971434354782104, "reward_std": 0.36104393005371094, "rewards/cosine_scaled_reward/mean": -0.28173214197158813, "rewards/cosine_scaled_reward/std": 0.17775526642799377, "rewards/format_reward/mean": 0.09375, "rewards/format_reward/std": 0.29378482699394226, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 1707.5625, "completions/mean_terminated_length": 1176.47998046875, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.013714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.3135142922401428, "learning_rate": 5.5e-07, "loss": -0.0, "num_tokens": 1525301.0, "reward": 0.0018395520746707916, "reward_std": 0.7012988328933716, "rewards/cosine_scaled_reward/mean": -0.21783021092414856, "rewards/cosine_scaled_reward/std": 0.324150949716568, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 1841.96875, "completions/mean_terminated_length": 1168.933349609375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.014857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2532394826412201, "learning_rate": 6e-07, "loss": -0.0, "num_tokens": 1654227.0, "reward": -0.10322706401348114, "reward_std": 0.6915165185928345, "rewards/cosine_scaled_reward/mean": -0.17661353945732117, "rewards/cosine_scaled_reward/std": 0.329875111579895, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1816.390625, "completions/mean_terminated_length": 1306.8499755859375, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.28405147790908813, "learning_rate": 6.5e-07, "loss": 0.0, "num_tokens": 1781084.0, "reward": 0.10602855682373047, "reward_std": 0.630502462387085, "rewards/cosine_scaled_reward/mean": -0.11104822158813477, "rewards/cosine_scaled_reward/std": 0.3846627473831177, "rewards/format_reward/mean": 0.328125, "rewards/format_reward/std": 0.4732423722743988, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 1702.109375, "completions/mean_terminated_length": 818.1666870117188, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.017142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.28779250383377075, "learning_rate": 7e-07, "loss": 0.0, "num_tokens": 1900939.0, "reward": 0.32734519243240356, "reward_std": 0.3870265483856201, "rewards/cosine_scaled_reward/mean": 0.007422588765621185, "rewards/cosine_scaled_reward/std": 0.45787373185157776, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.467176616191864, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.018285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.2337152510881424, "learning_rate": 7.5e-07, "loss": -0.0, "num_tokens": 2042451.0, "reward": -0.5429925918579102, "reward_std": 0.3153150975704193, "rewards/cosine_scaled_reward/mean": -0.2714962661266327, "rewards/cosine_scaled_reward/std": 0.1678173691034317, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 1564.921875, "completions/mean_terminated_length": 858.8846435546875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.019428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.33599403500556946, "learning_rate": 8e-07, "loss": -0.0, "num_tokens": 2153126.0, "reward": 0.17696775496006012, "reward_std": 0.6489306688308716, "rewards/cosine_scaled_reward/mean": -0.11464111506938934, "rewards/cosine_scaled_reward/std": 0.3551919758319855, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.49501484632492065, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 1795.390625, "completions/mean_terminated_length": 893.21435546875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.02057142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.22697053849697113, "learning_rate": 8.499999999999999e-07, "loss": -0.0, "num_tokens": 2278407.0, "reward": -0.10711958259344101, "reward_std": 0.5238703489303589, "rewards/cosine_scaled_reward/mean": -0.1785597801208496, "rewards/cosine_scaled_reward/std": 0.2545098662376404, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 1921.484375, "completions/mean_terminated_length": 1238.300048828125, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 0.021714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23972108960151672, "learning_rate": 9e-07, "loss": 0.0, "num_tokens": 2412638.0, "reward": 0.029344379901885986, "reward_std": 0.6719281077384949, "rewards/cosine_scaled_reward/mean": -0.086890310049057, "rewards/cosine_scaled_reward/std": 0.40220555663108826, "rewards/format_reward/mean": 0.203125, "rewards/format_reward/std": 0.40550529956817627, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 1728.5625, "completions/mean_terminated_length": 845.4117431640625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.022857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.23309311270713806, "learning_rate": 9.499999999999999e-07, "loss": 0.0, "num_tokens": 2534618.0, "reward": 0.0131673663854599, "reward_std": 0.4436222314834595, "rewards/cosine_scaled_reward/mean": -0.13404130935668945, "rewards/cosine_scaled_reward/std": 0.32819250226020813, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4531635046005249, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.71875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 1777.953125, "completions/mean_terminated_length": 1087.8333740234375, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.29990270733833313, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 2659215.0, "reward": -0.1764472872018814, "reward_std": 0.5121938586235046, "rewards/cosine_scaled_reward/mean": -0.2444736361503601, "rewards/cosine_scaled_reward/std": 0.289971262216568, "rewards/format_reward/mean": 0.3125, "rewards/format_reward/std": 0.467176616191864, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1361.28125, "completions/mean_terminated_length": 921.0769653320312, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.025142857142857144, "frac_reward_zero_std": 0.0, "grad_norm": 0.29922786355018616, "learning_rate": 9.99931462820376e-07, "loss": -0.0, "num_tokens": 2755353.0, "reward": 0.6089149713516235, "reward_std": 0.5986809730529785, "rewards/cosine_scaled_reward/mean": -0.05491749942302704, "rewards/cosine_scaled_reward/std": 0.39076483249664307, "rewards/format_reward/mean": 0.71875, "rewards/format_reward/std": 0.4531635046005249, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.578125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1565.046875, "completions/mean_terminated_length": 903.2222290039062, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.026285714285714287, "frac_reward_zero_std": 0.0, "grad_norm": 0.27512773871421814, "learning_rate": 9.997258721585931e-07, "loss": -0.0, "num_tokens": 2866308.0, "reward": 0.21871733665466309, "reward_std": 0.5976030826568604, "rewards/cosine_scaled_reward/mean": -0.10157884657382965, "rewards/cosine_scaled_reward/std": 0.3856185972690582, "rewards/format_reward/mean": 0.421875, "rewards/format_reward/std": 0.49776285886764526, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 1801.671875, "completions/mean_terminated_length": 1259.75, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "epoch": 0.027428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.22642865777015686, "learning_rate": 9.993832906395582e-07, "loss": -0.0, "num_tokens": 2992543.0, "reward": 0.04899948835372925, "reward_std": 0.8525694608688354, "rewards/cosine_scaled_reward/mean": -0.17081275582313538, "rewards/cosine_scaled_reward/std": 0.3993513882160187, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.4917473793029785, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1715.765625, "completions/mean_terminated_length": 1035.4761962890625, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.02857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.25316134095191956, "learning_rate": 9.989038226169207e-07, "loss": -0.0, "num_tokens": 3112648.0, "reward": 0.10585837811231613, "reward_std": 0.7828943729400635, "rewards/cosine_scaled_reward/mean": -0.11894579976797104, "rewards/cosine_scaled_reward/std": 0.4141720235347748, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1917.703125, "completions/mean_terminated_length": 1452.357177734375, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 0.029714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.2521306574344635, "learning_rate": 9.982876141412855e-07, "loss": -0.0, "num_tokens": 3246013.0, "reward": 0.17620250582695007, "reward_std": 0.6548349857330322, "rewards/cosine_scaled_reward/mean": -0.08377375453710556, "rewards/cosine_scaled_reward/std": 0.3527655303478241, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 1851.015625, "completions/mean_terminated_length": 1147.5, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.030857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.2730060815811157, "learning_rate": 9.975348529157229e-07, "loss": -0.0, "num_tokens": 3374766.0, "reward": -0.18854813277721405, "reward_std": 0.49348777532577515, "rewards/cosine_scaled_reward/mean": -0.21146157383918762, "rewards/cosine_scaled_reward/std": 0.2601618766784668, "rewards/format_reward/mean": 0.234375, "rewards/format_reward/std": 0.42695629596710205, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1798.328125, "completions/mean_terminated_length": 1049.3125, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 0.2566036880016327, "learning_rate": 9.96645768238595e-07, "loss": 0.0, "num_tokens": 3500195.0, "reward": 0.06705980002880096, "reward_std": 0.7090284824371338, "rewards/cosine_scaled_reward/mean": -0.10709509253501892, "rewards/cosine_scaled_reward/std": 0.4101051986217499, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4531635046005249, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1930.203125, "completions/mean_terminated_length": 1210.3333740234375, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.03314285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.25197461247444153, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "num_tokens": 3634200.0, "reward": -0.2462695688009262, "reward_std": 0.5237302780151367, "rewards/cosine_scaled_reward/mean": -0.2012597918510437, "rewards/cosine_scaled_reward/std": 0.23252712190151215, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.36596253514289856, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.796875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 1847.65625, "completions/mean_terminated_length": 1061.6923828125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.03428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.30431485176086426, "learning_rate": 9.944597532678119e-07, "loss": 0.0, "num_tokens": 3762986.0, "reward": -0.05392302945256233, "reward_std": 0.7249555587768555, "rewards/cosine_scaled_reward/mean": -0.15196150541305542, "rewards/cosine_scaled_reward/std": 0.34566983580589294, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 1838.671875, "completions/mean_terminated_length": 931.5833740234375, "completions/min_length": 399.0, "completions/min_terminated_length": 399.0, "epoch": 0.03542857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2484513372182846, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "num_tokens": 3891157.0, "reward": -0.11271396279335022, "reward_std": 0.6705260872840881, "rewards/cosine_scaled_reward/mean": -0.1813569962978363, "rewards/cosine_scaled_reward/std": 0.4071698486804962, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.78125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 1910.109375, "completions/mean_terminated_length": 1417.6429443359375, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.036571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.25329527258872986, "learning_rate": 9.917322325514487e-07, "loss": -0.0, "num_tokens": 4023756.0, "reward": -0.08931556344032288, "reward_std": 0.6381070613861084, "rewards/cosine_scaled_reward/mean": -0.16965776681900024, "rewards/cosine_scaled_reward/std": 0.37385129928588867, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 2023.71875, "completions/mean_terminated_length": 1530.0, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "epoch": 0.037714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.22758109867572784, "learning_rate": 9.901664203302124e-07, "loss": 0.0, "num_tokens": 4164490.0, "reward": -0.4589868187904358, "reward_std": 0.5177067518234253, "rewards/cosine_scaled_reward/mean": -0.2919934093952179, "rewards/cosine_scaled_reward/std": 0.2252870500087738, "rewards/format_reward/mean": 0.125, "rewards/format_reward/std": 0.3333333432674408, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 1454.78125, "completions/mean_terminated_length": 963.2571411132812, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.038857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.3234354257583618, "learning_rate": 9.88466529153356e-07, "loss": 0.0, "num_tokens": 4267148.0, "reward": 0.656031608581543, "reward_std": 0.7529654502868652, "rewards/cosine_scaled_reward/mean": 0.05457830801606178, "rewards/cosine_scaled_reward/std": 0.49684229493141174, "rewards/format_reward/mean": 0.546875, "rewards/format_reward/std": 0.501733124256134, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.828125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 1819.078125, "completions/mean_terminated_length": 716.0909423828125, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.2821458876132965, "learning_rate": 9.866330768241983e-07, "loss": -0.0, "num_tokens": 4395065.0, "reward": -0.09630556404590607, "reward_std": 0.7089139223098755, "rewards/cosine_scaled_reward/mean": -0.15752778947353363, "rewards/cosine_scaled_reward/std": 0.3647947609424591, "rewards/format_reward/mean": 0.21875, "rewards/format_reward/std": 0.4166666865348816, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 1954.34375, "completions/mean_terminated_length": 1382.0, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.04114285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.24163897335529327, "learning_rate": 9.846666218300807e-07, "loss": -0.0, "num_tokens": 4531255.0, "reward": -0.34593287110328674, "reward_std": 0.44493502378463745, "rewards/cosine_scaled_reward/mean": -0.24327893555164337, "rewards/cosine_scaled_reward/std": 0.24784433841705322, "rewards/format_reward/mean": 0.140625, "rewards/format_reward/std": 0.3503824472427368, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.8125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 1868.921875, "completions/mean_terminated_length": 1092.916748046875, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.04228571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.24795544147491455, "learning_rate": 9.825677631722435e-07, "loss": -0.0, "num_tokens": 4661890.0, "reward": -0.23053905367851257, "reward_std": 0.34036368131637573, "rewards/cosine_scaled_reward/mean": -0.2246445268392563, "rewards/cosine_scaled_reward/std": 0.15942412614822388, "rewards/format_reward/mean": 0.21875, "rewards/format_reward/std": 0.4166666865348816, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.84375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 1889.53125, "completions/mean_terminated_length": 1033.800048828125, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.04342857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24283826351165771, "learning_rate": 9.80337140183366e-07, "loss": 0.0, "num_tokens": 4794532.0, "reward": -0.10043507814407349, "reward_std": 0.47925832867622375, "rewards/cosine_scaled_reward/mean": -0.13615503907203674, "rewards/cosine_scaled_reward/std": 0.3336707651615143, "rewards/format_reward/mean": 0.171875, "rewards/format_reward/std": 0.38025420904159546, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 1644.828125, "completions/mean_terminated_length": 689.9473876953125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.044571428571428574, "frac_reward_zero_std": 0.0, "grad_norm": 0.28362998366355896, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "num_tokens": 4910585.0, "reward": 0.12284853309392929, "reward_std": 0.4183085858821869, "rewards/cosine_scaled_reward/mean": -0.11045074462890625, "rewards/cosine_scaled_reward/std": 0.30217844247817993, "rewards/format_reward/mean": 0.34375, "rewards/format_reward/std": 0.4787135720252991, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1632.0, "completions/mean_length": 1618.28125, "completions/mean_terminated_length": 902.0833740234375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.045714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.262617826461792, "learning_rate": 9.754833590196926e-07, "loss": 0.0, "num_tokens": 5024227.0, "reward": 0.2076582908630371, "reward_std": 0.42125773429870605, "rewards/cosine_scaled_reward/mean": -0.12273336946964264, "rewards/cosine_scaled_reward/std": 0.4404613971710205, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.501733124256134, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.59375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 1717.734375, "completions/mean_terminated_length": 1235.0384521484375, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "epoch": 0.046857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.23294499516487122, "learning_rate": 9.728616793536587e-07, "loss": -0.0, "num_tokens": 5145314.0, "reward": 0.011502981185913086, "reward_std": 0.6816084980964661, "rewards/cosine_scaled_reward/mean": -0.22081100940704346, "rewards/cosine_scaled_reward/std": 0.37589573860168457, "rewards/format_reward/mean": 0.453125, "rewards/format_reward/std": 0.501733124256134, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 1703.921875, "completions/mean_terminated_length": 579.933349609375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.34672290086746216, "learning_rate": 9.701111919237408e-07, "loss": -0.0, "num_tokens": 5264725.0, "reward": -0.2616002857685089, "reward_std": 0.37952175736427307, "rewards/cosine_scaled_reward/mean": -0.26361262798309326, "rewards/cosine_scaled_reward/std": 0.17531204223632812, "rewards/format_reward/mean": 0.265625, "rewards/format_reward/std": 0.44515693187713623, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.703125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 1681.84375, "completions/mean_terminated_length": 814.631591796875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.04914285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.263967901468277, "learning_rate": 9.672327345550543e-07, "loss": -0.0, "num_tokens": 5383979.0, "reward": 0.13376155495643616, "reward_std": 0.46012288331985474, "rewards/cosine_scaled_reward/mean": -0.08155670762062073, "rewards/cosine_scaled_reward/std": 0.3612325191497803, "rewards/format_reward/mean": 0.296875, "rewards/format_reward/std": 0.4604927599430084, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.640625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1624.625, "completions/mean_terminated_length": 869.9130859375, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.05028571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.28927963972091675, "learning_rate": 9.64227184053598e-07, "loss": -0.0, "num_tokens": 5498651.0, "reward": 0.20869271457195282, "reward_std": 0.5558150410652161, "rewards/cosine_scaled_reward/mean": -0.0987786278128624, "rewards/cosine_scaled_reward/std": 0.42912590503692627, "rewards/format_reward/mean": 0.40625, "rewards/format_reward/std": 0.49501484632492065, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 2006.96875, "completions/mean_terminated_length": 1522.800048828125, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 0.05142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.24254000186920166, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "num_tokens": 5638753.0, "reward": -0.2540697157382965, "reward_std": 0.4600578844547272, "rewards/cosine_scaled_reward/mean": -0.20515984296798706, "rewards/cosine_scaled_reward/std": 0.3251590430736542, "rewards/format_reward/mean": 0.15625, "rewards/format_reward/std": 0.36596253514289856, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 1765.984375, "completions/mean_terminated_length": 919.9375, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.052571428571428575, "frac_reward_zero_std": 0.0, "grad_norm": 0.2645930349826813, "learning_rate": 9.578385041664925e-07, "loss": 0.0, "num_tokens": 5762944.0, "reward": -0.213707834482193, "reward_std": 0.38778313994407654, "rewards/cosine_scaled_reward/mean": -0.2318539321422577, "rewards/cosine_scaled_reward/std": 0.21436986327171326, "rewards/format_reward/mean": 0.25, "rewards/format_reward/std": 0.4364357888698578, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1583.40625, "completions/mean_terminated_length": 986.0714721679688, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.053714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.311797559261322, "learning_rate": 9.54457320834625e-07, "loss": 0.0, "num_tokens": 5874682.0, "reward": 0.27925533056259155, "reward_std": 0.6467443704605103, "rewards/cosine_scaled_reward/mean": -0.07912233471870422, "rewards/cosine_scaled_reward/std": 0.4737093150615692, "rewards/format_reward/mean": 0.4375, "rewards/format_reward/std": 0.5, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 1690.0625, "completions/mean_terminated_length": 1006.727294921875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.054857142857142854, "frac_reward_zero_std": 0.0, "grad_norm": 0.26644304394721985, "learning_rate": 9.509529358847654e-07, "loss": -0.0, "num_tokens": 5993390.0, "reward": 0.13692031800746918, "reward_std": 0.5655145049095154, "rewards/cosine_scaled_reward/mean": -0.12685233354568481, "rewards/cosine_scaled_reward/std": 0.32320985198020935, "rewards/format_reward/mean": 0.390625, "rewards/format_reward/std": 0.4917473793029785, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1387.140625, "completions/mean_terminated_length": 804.0294189453125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.3078882396221161, "learning_rate": 9.473264167865171e-07, "loss": 0.0, "num_tokens": 6092231.0, "reward": 0.35559189319610596, "reward_std": 0.5927403569221497, "rewards/cosine_scaled_reward/mean": -0.09564155340194702, "rewards/cosine_scaled_reward/std": 0.4046906530857086, "rewards/format_reward/mean": 0.546875, "rewards/format_reward/std": 0.501733124256134, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.65625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1674.890625, "completions/mean_terminated_length": 962.5909423828125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.05714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.23925544321537018, "learning_rate": 9.43578868212728e-07, "loss": -0.0, "num_tokens": 6210240.0, "reward": 0.18573230504989624, "reward_std": 0.5264967083930969, "rewards/cosine_scaled_reward/mean": -0.09463384002447128, "rewards/cosine_scaled_reward/std": 0.4100942015647888, "rewards/format_reward/mean": 0.375, "rewards/format_reward/std": 0.48795005679130554, "step": 50 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 6210240, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }