{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 1840.125, "completions/mean_terminated_length": 1493.666748046875, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.6628252342045541, "kl": 0.0004730224609375, "learning_rate": 0.0, "loss": 0.0828, "num_tokens": 15585.0, "reward": 0.53125, "reward_std": 0.41052013635635376, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1674.0, "completions/mean_length": 1917.875, "completions/mean_terminated_length": 1527.5, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "epoch": 0.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.7317782492587136, "kl": 0.00067138671875, "learning_rate": 1e-07, "loss": 0.069, "num_tokens": 32088.0, "reward": 0.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.13363061845302582, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1641.0, "completions/mean_length": 1356.875, "completions/mean_terminated_length": 1126.5, "completions/min_length": 625.0, "completions/min_terminated_length": 625.0, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.9091411567234831, "kl": 0.0005397796630859375, "learning_rate": 2e-07, "loss": 0.039, "num_tokens": 44431.0, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.7902192806756456, "kl": 0.000720977783203125, "learning_rate": 3e-07, "loss": 0.0, "num_tokens": 62215.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 1989.5, "completions/mean_terminated_length": 1580.0, "completions/min_length": 1580.0, "completions/min_terminated_length": 1580.0, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.4606638644327677, "kl": 0.00045490264892578125, "learning_rate": 4e-07, "loss": 0.0382, "num_tokens": 79059.0, "reward": 0.5, "reward_std": 0.4225771427154541, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.13363061845302582, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.06, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036145388208231963, "kl": 0.0007953643798828125, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 96667.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1741.0, "completions/mean_length": 1504.875, "completions/mean_terminated_length": 1427.2857666015625, "completions/min_length": 1116.0, "completions/min_terminated_length": 1116.0, "epoch": 0.07, "frac_reward_zero_std": 1.0, "grad_norm": 0.003359971830136702, "kl": 0.000370025634765625, "learning_rate": 6e-07, "loss": 0.0, "num_tokens": 110562.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 833.625, "completions/mean_terminated_length": 833.625, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.8406033348743903, "kl": 0.000423431396484375, "learning_rate": 7e-07, "loss": 0.0395, "num_tokens": 118143.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.1767766922712326, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 1935.375, "completions/mean_terminated_length": 1597.5, "completions/min_length": 1312.0, "completions/min_terminated_length": 1312.0, "epoch": 0.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.6507369858144503, "kl": 0.0004940032958984375, "learning_rate": 8e-07, "loss": 0.0704, "num_tokens": 134458.0, "reward": 0.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 1947.0, "completions/mean_terminated_length": 1644.0, "completions/min_length": 1594.0, "completions/min_terminated_length": 1594.0, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.9403904262794942, "kl": 0.000942230224609375, "learning_rate": 9e-07, "loss": 0.0548, "num_tokens": 151226.0, "reward": 0.375, "reward_std": 0.26726123690605164, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.26726123690605164, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.11, "frac_reward_zero_std": 1.0, "grad_norm": 0.003568944942064051, "kl": 0.0007343292236328125, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 168538.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 745.375, "completions/mean_terminated_length": 745.375, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 0.12, "frac_reward_zero_std": 1.0, "grad_norm": 0.006363067666064702, "kl": 0.00060272216796875, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "num_tokens": 175093.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.13, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029190106524072316, "kl": 0.0005397796630859375, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "num_tokens": 192629.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1707.875, "completions/mean_terminated_length": 1594.5, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "epoch": 0.14, "frac_reward_zero_std": 0.0, "grad_norm": 0.6809709230610892, "kl": 0.00041961669921875, "learning_rate": 9.975348529157229e-07, "loss": 0.121, "num_tokens": 207108.0, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.15, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028157580161607284, "kl": 0.00044918060302734375, "learning_rate": 9.956206309337066e-07, "loss": 0.0, "num_tokens": 224476.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 1521.0, "completions/mean_terminated_length": 1345.3333740234375, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 0.16, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032241372357074316, "kl": 0.0003681182861328125, "learning_rate": 9.931634888554935e-07, "loss": 0.0, "num_tokens": 237716.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 2038.875, "completions/mean_terminated_length": 1975.0, "completions/min_length": 1975.0, "completions/min_terminated_length": 1975.0, "epoch": 0.17, "frac_reward_zero_std": 0.0, "grad_norm": 0.5238026110591891, "kl": 0.00043487548828125, "learning_rate": 9.901664203302124e-07, "loss": 0.0064, "num_tokens": 254851.0, "reward": 0.34375, "reward_std": 0.2651650309562683, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.2651650309562683, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 1472.875, "completions/mean_terminated_length": 1390.71435546875, "completions/min_length": 948.0, "completions/min_terminated_length": 948.0, "epoch": 0.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.6969142375215345, "kl": 0.000553131103515625, "learning_rate": 9.866330768241983e-07, "loss": 0.0285, "num_tokens": 267802.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.1767766922712326, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.19, "frac_reward_zero_std": 1.0, "grad_norm": 0.006170385130050486, "kl": 0.00032806396484375, "learning_rate": 9.825677631722435e-07, "loss": 0.0, "num_tokens": 285002.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 1872.875, "completions/mean_terminated_length": 1697.75, "completions/min_length": 1359.0, "completions/min_terminated_length": 1359.0, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.5924705799517846, "kl": 0.000545501708984375, "learning_rate": 9.779754323328192e-07, "loss": 0.0592, "num_tokens": 301433.0, "reward": 0.78125, "reward_std": 0.60411536693573, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 718.75, "completions/mean_terminated_length": 718.75, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.21, "frac_reward_zero_std": 0.0, "grad_norm": 1.076208309541439, "kl": 0.000446319580078125, "learning_rate": 9.728616793536587e-07, "loss": -0.0477, "num_tokens": 308647.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 751.0, "completions/mean_terminated_length": 751.0, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.22, "frac_reward_zero_std": 0.0, "grad_norm": 1.6075613594297673, "kl": 0.0004673004150390625, "learning_rate": 9.672327345550543e-07, "loss": -0.2154, "num_tokens": 315719.0, "reward": 0.6875, "reward_std": 0.5303300619125366, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.1767766922712326, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 1307.125, "completions/mean_terminated_length": 862.6000366210938, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.8767085399237937, "kl": 0.0005292892456054688, "learning_rate": 9.610954559391704e-07, "loss": 0.2683, "num_tokens": 327448.0, "reward": 0.96875, "reward_std": 0.5737953186035156, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24, "frac_reward_zero_std": 0.0, "grad_norm": 0.6467284427973272, "kl": 0.0005121231079101562, "learning_rate": 9.54457320834625e-07, "loss": 0.0, "num_tokens": 344728.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1745.0, "completions/mean_terminated_length": 1442.0, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.6364191810172123, "kl": 0.00034999847412109375, "learning_rate": 9.473264167865171e-07, "loss": 0.0907, "num_tokens": 359472.0, "reward": 0.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.13363061845302582, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 860.125, "completions/mean_terminated_length": 860.125, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.9557565788184962, "kl": 0.0004291534423828125, "learning_rate": 9.397114317029974e-07, "loss": -0.0065, "num_tokens": 367057.0, "reward": 0.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27, "frac_reward_zero_std": 1.0, "grad_norm": 0.003114398061108756, "kl": 0.0005168914794921875, "learning_rate": 9.316216432703916e-07, "loss": 0.0, "num_tokens": 384721.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1633.625, "completions/mean_terminated_length": 1495.5, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "epoch": 0.28, "frac_reward_zero_std": 0.0, "grad_norm": 0.8883130168762549, "kl": 0.000690460205078125, "learning_rate": 9.230669076497687e-07, "loss": 0.0, "num_tokens": 398990.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1646.0, "completions/mean_length": 1305.875, "completions/mean_terminated_length": 1058.5, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.29, "frac_reward_zero_std": 0.0, "grad_norm": 0.8508879198267415, "kl": 0.00045108795166015625, "learning_rate": 9.140576474687263e-07, "loss": 0.2107, "num_tokens": 410989.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.3, "frac_reward_zero_std": 1.0, "grad_norm": 0.009396867221837423, "kl": 0.000469207763671875, "learning_rate": 9.046048391230247e-07, "loss": 0.0, "num_tokens": 414372.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 1646.625, "completions/mean_terminated_length": 1512.8333740234375, "completions/min_length": 1172.0, "completions/min_terminated_length": 1172.0, "epoch": 0.31, "frac_reward_zero_std": 0.0, "grad_norm": 0.677636559478867, "kl": 0.0004253387451171875, "learning_rate": 8.9471999940354e-07, "loss": 0.0789, "num_tokens": 429153.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32, "frac_reward_zero_std": 0.0, "grad_norm": 0.42518375588544877, "kl": 0.000492095947265625, "learning_rate": 8.844151714648274e-07, "loss": 0.0, "num_tokens": 446505.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 1977.625, "completions/mean_terminated_length": 1485.0, "completions/min_length": 1485.0, "completions/min_terminated_length": 1485.0, "epoch": 0.33, "frac_reward_zero_std": 0.0, "grad_norm": 0.7573312728335488, "kl": 0.00067901611328125, "learning_rate": 8.737029101523929e-07, "loss": 0.0563, "num_tokens": 463614.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031934478388464358, "kl": 0.000537872314453125, "learning_rate": 8.625962667065487e-07, "loss": 0.0, "num_tokens": 481382.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.6214744310551946, "kl": 0.00048160552978515625, "learning_rate": 8.511087728614862e-07, "loss": 0.0, "num_tokens": 498622.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1532.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 792.625, "completions/mean_terminated_length": 792.625, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "epoch": 0.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.8848488126874285, "kl": 0.0005741119384765625, "learning_rate": 8.392544243589427e-07, "loss": -0.0344, "num_tokens": 506091.0, "reward": 0.9375, "reward_std": 0.47715675830841064, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6875, "rewards/tag_count_reward/std": 0.22160132229328156, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037357667583392553, "kl": 0.000514984130859375, "learning_rate": 8.270476638965461e-07, "loss": 0.0, "num_tokens": 523475.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 1464.875, "completions/mean_terminated_length": 1115.0, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 0.38, "frac_reward_zero_std": 0.0, "grad_norm": 0.9255650045041091, "kl": 0.0004863739013671875, "learning_rate": 8.145033635316128e-07, "loss": 0.242, "num_tokens": 536562.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 1548.25, "completions/mean_terminated_length": 1476.857177734375, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 0.39, "frac_reward_zero_std": 0.0, "grad_norm": 0.7119712141872488, "kl": 0.0004978179931640625, "learning_rate": 8.01636806561836e-07, "loss": 0.0712, "num_tokens": 550620.0, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037004750751766843, "kl": 0.000919342041015625, "learning_rate": 7.884636689049422e-07, "loss": 0.0, "num_tokens": 568516.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1684.25, "completions/mean_terminated_length": 1320.5, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "epoch": 0.41, "frac_reward_zero_std": 0.0, "grad_norm": 0.6765909685421715, "kl": 0.0006198883056640625, "learning_rate": 7.75e-07, "loss": 0.2158, "num_tokens": 582934.0, "reward": 0.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.13363061845302582, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 1946.125, "completions/mean_terminated_length": 1233.0, "completions/min_length": 1233.0, "completions/min_terminated_length": 1233.0, "epoch": 0.42, "frac_reward_zero_std": 0.0, "grad_norm": 0.6782057995689209, "kl": 0.000667572021484375, "learning_rate": 7.612622032536507e-07, "loss": 0.0878, "num_tokens": 600151.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43, "frac_reward_zero_std": 1.0, "grad_norm": 0.003069913511462463, "kl": 0.0005235671997070312, "learning_rate": 7.472670160550848e-07, "loss": 0.0, "num_tokens": 617599.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 2034.625, "completions/mean_terminated_length": 1941.0, "completions/min_length": 1941.0, "completions/min_terminated_length": 1941.0, "epoch": 0.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.47430574211354576, "kl": 0.00039577484130859375, "learning_rate": 7.330314893841101e-07, "loss": 0.0073, "num_tokens": 634780.0, "reward": 0.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 1884.75, "completions/mean_terminated_length": 1830.3333740234375, "completions/min_length": 1705.0, "completions/min_terminated_length": 1705.0, "epoch": 0.45, "frac_reward_zero_std": 0.0, "grad_norm": 0.7613443731007656, "kl": 0.0006561279296875, "learning_rate": 7.185729670371604e-07, "loss": 0.0207, "num_tokens": 652242.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 1147.75, "completions/mean_terminated_length": 847.6666870117188, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.46, "frac_reward_zero_std": 0.0, "grad_norm": 1.0778276077011943, "kl": 0.00066375732421875, "learning_rate": 7.039090644965509e-07, "loss": 0.0154, "num_tokens": 662560.0, "reward": 0.8125, "reward_std": 0.5786375403404236, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 736.25, "completions/mean_terminated_length": 736.25, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.47, "frac_reward_zero_std": 0.0, "grad_norm": 1.7259986577953768, "kl": 0.00115966796875, "learning_rate": 6.890576474687263e-07, "loss": -0.0495, "num_tokens": 669274.0, "reward": 0.9375, "reward_std": 0.4172614812850952, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6875, "rewards/tag_count_reward/std": 0.25877460837364197, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 1060.125, "completions/mean_terminated_length": 1060.125, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 0.48, "frac_reward_zero_std": 0.0, "grad_norm": 0.6442195673577423, "kl": 0.0004329681396484375, "learning_rate": 6.740368101176495e-07, "loss": 0.0696, "num_tokens": 678611.0, "reward": 0.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.1767766922712326, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49, "frac_reward_zero_std": 1.0, "grad_norm": 0.006123254776481617, "kl": 0.00051116943359375, "learning_rate": 6.588648530198504e-07, "loss": 0.0, "num_tokens": 696635.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 1975.25, "completions/mean_terminated_length": 1466.0, "completions/min_length": 1466.0, "completions/min_terminated_length": 1466.0, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.6336529820312526, "kl": 0.0006256103515625, "learning_rate": 6.435602608679916e-07, "loss": 0.0585, "num_tokens": 713709.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 1899.0, "completions/mean_terminated_length": 856.0, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 0.51, "frac_reward_zero_std": 0.0, "grad_norm": 0.49637328343233095, "kl": 0.0006580352783203125, "learning_rate": 6.281416799501187e-07, "loss": 0.1451, "num_tokens": 729901.0, "reward": 0.40625, "reward_std": 0.4419417381286621, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 2043.625, "completions/mean_terminated_length": 2013.0, "completions/min_length": 2013.0, "completions/min_terminated_length": 2013.0, "epoch": 0.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.7195590756820243, "kl": 0.00067138671875, "learning_rate": 6.126278954320294e-07, "loss": 0.0031, "num_tokens": 747090.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 565.5, "completions/mean_terminated_length": 565.5, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.53, "frac_reward_zero_std": 1.0, "grad_norm": 0.031718782447743454, "kl": 0.0008029937744140625, "learning_rate": 5.97037808470444e-07, "loss": 0.0, "num_tokens": 752910.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 1898.375, "completions/mean_terminated_length": 851.0, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.6988079348841579, "kl": 0.0009441375732421875, "learning_rate": 5.813904131848564e-07, "loss": 0.1114, "num_tokens": 769249.0, "reward": 0.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1715.875, "completions/mean_terminated_length": 1383.75, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.55, "frac_reward_zero_std": 0.0, "grad_norm": 0.7609580950694678, "kl": 0.0007781982421875, "learning_rate": 5.657047735161255e-07, "loss": 0.1302, "num_tokens": 783920.0, "reward": 0.90625, "reward_std": 0.6399986147880554, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 829.625, "completions/mean_terminated_length": 655.5714721679688, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.56, "frac_reward_zero_std": 0.0, "grad_norm": 1.1927365377038142, "kl": 0.001590728759765625, "learning_rate": 5.5e-07, "loss": 0.0939, "num_tokens": 791709.0, "reward": 0.53125, "reward_std": 0.2086307406425476, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.2086307406425476, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1331.0, "completions/mean_length": 994.5, "completions/mean_terminated_length": 844.0000610351562, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.57, "frac_reward_zero_std": 0.0, "grad_norm": 1.182121142240305, "kl": 0.0005893707275390625, "learning_rate": 5.342952264838747e-07, "loss": 0.0, "num_tokens": 801057.0, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.625, "rewards/tag_count_reward/std": 0.2314550280570984, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 834.25, "completions/mean_terminated_length": 834.25, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.8842736658535645, "kl": 0.0005855560302734375, "learning_rate": 5.186095868151436e-07, "loss": 0.0177, "num_tokens": 809131.0, "reward": 0.875, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 580.25, "completions/mean_terminated_length": 580.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.59, "frac_reward_zero_std": 0.0, "grad_norm": 0.947212959421752, "kl": 0.000576019287109375, "learning_rate": 5.02962191529556e-07, "loss": 0.1389, "num_tokens": 814829.0, "reward": 0.9375, "reward_std": 0.45806270837783813, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6875, "rewards/tag_count_reward/std": 0.1767766922712326, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 1823.875, "completions/mean_terminated_length": 1450.3333740234375, "completions/min_length": 1205.0, "completions/min_terminated_length": 1205.0, "epoch": 0.6, "frac_reward_zero_std": 0.0, "grad_norm": 0.7296721993472337, "kl": 0.0008335113525390625, "learning_rate": 4.873721045679706e-07, "loss": 0.1096, "num_tokens": 831916.0, "reward": 0.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.13363061845302582, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1570.0, "completions/mean_terminated_length": 1410.666748046875, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 0.61, "frac_reward_zero_std": 0.0, "grad_norm": 0.6173214568936407, "kl": 0.000675201416015625, "learning_rate": 4.7185832004988133e-07, "loss": 0.0118, "num_tokens": 845372.0, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1746.5, "completions/mean_terminated_length": 1445.0, "completions/min_length": 1093.0, "completions/min_terminated_length": 1093.0, "epoch": 0.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.6996417102286492, "kl": 0.0004863739013671875, "learning_rate": 4.5643973913200837e-07, "loss": 0.0543, "num_tokens": 860464.0, "reward": 0.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.13363061845302582, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1849.75, "completions/mean_terminated_length": 1651.5, "completions/min_length": 1406.0, "completions/min_terminated_length": 1406.0, "epoch": 0.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.5518151553173074, "kl": 0.00035190582275390625, "learning_rate": 4.4113514698014953e-07, "loss": 0.0645, "num_tokens": 876246.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1935.0, "completions/mean_length": 1869.0, "completions/mean_terminated_length": 1332.0, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 0.64, "frac_reward_zero_std": 0.0, "grad_norm": 0.7947269366057148, "kl": 0.00045108795166015625, "learning_rate": 4.2596318988235037e-07, "loss": 0.1176, "num_tokens": 892662.0, "reward": 0.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.13363061845302582, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 1797.0, "completions/mean_terminated_length": 1378.666748046875, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 0.65, "frac_reward_zero_std": 0.0, "grad_norm": 0.7873341627366034, "kl": 0.001064300537109375, "learning_rate": 4.1094235253127374e-07, "loss": 0.1647, "num_tokens": 908198.0, "reward": 0.34375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 1967.125, "completions/mean_terminated_length": 1401.0, "completions/min_length": 1401.0, "completions/min_terminated_length": 1401.0, "epoch": 0.66, "frac_reward_zero_std": 0.0, "grad_norm": 0.6108272802669615, "kl": 0.0005245208740234375, "learning_rate": 3.9609093550344907e-07, "loss": 0.0663, "num_tokens": 924743.0, "reward": 0.40625, "reward_std": 0.4419417381286621, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 2007.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1388.0, "completions/mean_terminated_length": 1388.0, "completions/min_length": 1120.0, "completions/min_terminated_length": 1120.0, "epoch": 0.67, "frac_reward_zero_std": 0.0, "grad_norm": 0.6923989772553029, "kl": 0.0004787445068359375, "learning_rate": 3.8142703296283953e-07, "loss": 0.0752, "num_tokens": 937455.0, "reward": 0.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.625, "rewards/tag_count_reward/std": 0.2314550280570984, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.68, "frac_reward_zero_std": 0.0, "grad_norm": 0.5804438368543029, "kl": 0.0006103515625, "learning_rate": 3.6696851061588994e-07, "loss": 0.0, "num_tokens": 957231.0, "reward": 0.5625, "reward_std": 0.5786375403404236, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.69, "frac_reward_zero_std": 0.0, "grad_norm": 0.602172715608051, "kl": 0.0004863739013671875, "learning_rate": 3.5273298394491515e-07, "loss": 0.0, "num_tokens": 974775.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 1111.75, "completions/mean_terminated_length": 978.0000610351562, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 0.7, "frac_reward_zero_std": 0.0, "grad_norm": 0.5800710876181367, "kl": 0.0004787445068359375, "learning_rate": 3.387377967463493e-07, "loss": 0.0362, "num_tokens": 985317.0, "reward": 0.59375, "reward_std": 0.376485139131546, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1826.75, "completions/mean_terminated_length": 1605.5, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 0.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.8588425272525544, "kl": 0.001007080078125, "learning_rate": 3.250000000000001e-07, "loss": 0.025, "num_tokens": 1001051.0, "reward": 0.5, "reward_std": 0.4225771427154541, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.13363061845302582, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1592.0, "completions/mean_length": 1284.0, "completions/mean_terminated_length": 1174.857177734375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.72, "frac_reward_zero_std": 1.0, "grad_norm": 0.013235371533423931, "kl": 0.0008678436279296875, "learning_rate": 3.115363310950578e-07, "loss": 0.0, "num_tokens": 1012123.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1916.25, "completions/mean_terminated_length": 1696.666748046875, "completions/min_length": 1519.0, "completions/min_terminated_length": 1519.0, "epoch": 0.73, "frac_reward_zero_std": 0.0, "grad_norm": 0.5474968619673198, "kl": 0.000751495361328125, "learning_rate": 2.9836319343816397e-07, "loss": 0.0, "num_tokens": 1028421.0, "reward": 0.65625, "reward_std": 0.5334774851799011, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1883.0, "completions/max_terminated_length": 1883.0, "completions/mean_length": 1288.75, "completions/mean_terminated_length": 1288.75, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 0.74, "frac_reward_zero_std": 1.0, "grad_norm": 0.003456095198800055, "kl": 0.0003814697265625, "learning_rate": 2.854966364683872e-07, "loss": 0.0, "num_tokens": 1039587.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1668.0, "completions/mean_length": 1781.0, "completions/mean_terminated_length": 1514.0, "completions/min_length": 1256.0, "completions/min_terminated_length": 1256.0, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 0.548319750098783, "kl": 0.00042057037353515625, "learning_rate": 2.729523361034538e-07, "loss": 0.0938, "num_tokens": 1054939.0, "reward": 0.9375, "reward_std": 0.6087164282798767, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.76, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028496725166757297, "kl": 0.00051116943359375, "learning_rate": 2.6074557564105724e-07, "loss": 0.0, "num_tokens": 1072235.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 1953.375, "completions/mean_terminated_length": 1669.5, "completions/min_length": 1396.0, "completions/min_terminated_length": 1396.0, "epoch": 0.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.706431481153136, "kl": 0.000545501708984375, "learning_rate": 2.488912271385139e-07, "loss": 0.0582, "num_tokens": 1089334.0, "reward": 0.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 1863.25, "completions/mean_terminated_length": 1555.3333740234375, "completions/min_length": 1322.0, "completions/min_terminated_length": 1322.0, "epoch": 0.78, "frac_reward_zero_std": 0.0, "grad_norm": 0.7026848252787423, "kl": 0.0008983612060546875, "learning_rate": 2.374037332934512e-07, "loss": 0.1001, "num_tokens": 1105592.0, "reward": 0.40625, "reward_std": 0.2651650309562683, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.2651650309562683, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 1021.625, "completions/mean_terminated_length": 1021.625, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 0.79, "frac_reward_zero_std": 1.0, "grad_norm": 0.004479904510744182, "kl": 0.0004482269287109375, "learning_rate": 2.2629708984760706e-07, "loss": 0.0, "num_tokens": 1114469.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8, "frac_reward_zero_std": 1.0, "grad_norm": 0.012713748507087811, "kl": 0.000774383544921875, "learning_rate": 2.1558482853517253e-07, "loss": 0.0, "num_tokens": 1132061.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.81, "frac_reward_zero_std": 1.0, "grad_norm": 0.009237916379458392, "kl": 0.00116729736328125, "learning_rate": 2.0528000059645995e-07, "loss": 0.0, "num_tokens": 1149405.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.82, "frac_reward_zero_std": 0.0, "grad_norm": 0.731282287844213, "kl": 0.00102996826171875, "learning_rate": 1.9539516087697517e-07, "loss": 0.0, "num_tokens": 1167485.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1497.0, "completions/mean_length": 1806.375, "completions/mean_terminated_length": 1403.666748046875, "completions/min_length": 1279.0, "completions/min_terminated_length": 1279.0, "epoch": 0.83, "frac_reward_zero_std": 0.0, "grad_norm": 0.6696391690257025, "kl": 0.0007190704345703125, "learning_rate": 1.8594235253127372e-07, "loss": 0.1358, "num_tokens": 1182888.0, "reward": 0.34375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 1856.375, "completions/mean_terminated_length": 1537.0, "completions/min_length": 1284.0, "completions/min_terminated_length": 1284.0, "epoch": 0.84, "frac_reward_zero_std": 0.0, "grad_norm": 0.8520445078964867, "kl": 0.00098419189453125, "learning_rate": 1.7693309235023127e-07, "loss": 0.1106, "num_tokens": 1198539.0, "reward": 0.34375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1619.0, "completions/mean_length": 1552.75, "completions/mean_terminated_length": 1387.666748046875, "completions/min_length": 1177.0, "completions/min_terminated_length": 1177.0, "epoch": 0.85, "frac_reward_zero_std": 0.0, "grad_norm": 0.6415780475709305, "kl": 0.00045013427734375, "learning_rate": 1.6837835672960831e-07, "loss": 0.1278, "num_tokens": 1211897.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1301.75, "completions/mean_terminated_length": 854.0, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.86, "frac_reward_zero_std": 0.0, "grad_norm": 1.2846817650924338, "kl": 0.0011138916015625, "learning_rate": 1.6028856829700258e-07, "loss": 0.3474, "num_tokens": 1223335.0, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 1210.625, "completions/mean_terminated_length": 1091.0, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "epoch": 0.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.7215403346916941, "kl": 0.0008296966552734375, "learning_rate": 1.5267358321348285e-07, "loss": 0.0492, "num_tokens": 1233980.0, "reward": 0.53125, "reward_std": 0.2086307406425476, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.2086307406425476, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 1143.375, "completions/mean_terminated_length": 841.8333740234375, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.888295640678355, "kl": 0.0009326934814453125, "learning_rate": 1.4554267916537495e-07, "loss": 0.203, "num_tokens": 1244423.0, "reward": 0.5625, "reward_std": 0.2912411689758301, "rewards/accuracy_reward/mean": NaN, "rewards/accuracy_reward/std": NaN, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.29124119877815247, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1903.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 1234.0, "completions/mean_terminated_length": 1234.0, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 0.89, "frac_reward_zero_std": 0.0, "grad_norm": 0.4757099764606759, "kl": 0.000911712646484375, "learning_rate": 1.3890454406082956e-07, "loss": 0.0644, "num_tokens": 1255343.0, "reward": 0.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 2009.625, "completions/mean_terminated_length": 1894.5, "completions/min_length": 1821.0, "completions/min_terminated_length": 1821.0, "epoch": 0.9, "frac_reward_zero_std": 0.0, "grad_norm": 0.7840957699122901, "kl": 0.00072479248046875, "learning_rate": 1.3276726544494571e-07, "loss": 0.0212, "num_tokens": 1272652.0, "reward": 0.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.1157275140285492, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1858.25, "completions/mean_terminated_length": 1542.0, "completions/min_length": 1311.0, "completions/min_terminated_length": 1311.0, "epoch": 0.91, "frac_reward_zero_std": 0.0, "grad_norm": 0.8165269333115026, "kl": 0.00063323974609375, "learning_rate": 1.2713832064634125e-07, "loss": 0.0456, "num_tokens": 1289190.0, "reward": 0.46875, "reward_std": 0.4317220449447632, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.6822113621589488, "kl": 0.000732421875, "learning_rate": 1.220245676671809e-07, "loss": 0.0, "num_tokens": 1307070.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 2048.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 1996.75, "completions/mean_terminated_length": 1638.0, "completions/min_length": 1638.0, "completions/min_terminated_length": 1638.0, "epoch": 0.93, "frac_reward_zero_std": 0.0, "grad_norm": 0.5735308409410357, "kl": 0.000598907470703125, "learning_rate": 1.1743223682775649e-07, "loss": 0.0, "num_tokens": 1324564.0, "reward": 0.34375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.94, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035277658572437772, "kl": 0.00074005126953125, "learning_rate": 1.1336692317580158e-07, "loss": 0.0, "num_tokens": 1342244.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.95, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033309614626830077, "kl": 0.0007114410400390625, "learning_rate": 1.0983357966978745e-07, "loss": 0.0, "num_tokens": 1360532.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.96, "frac_reward_zero_std": 1.0, "grad_norm": 0.009616790202442917, "kl": 0.0008449554443359375, "learning_rate": 1.068365111445064e-07, "loss": 0.0, "num_tokens": 1377964.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 2048.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 1599.625, "completions/mean_terminated_length": 1450.166748046875, "completions/min_length": 1096.0, "completions/min_terminated_length": 1096.0, "epoch": 0.97, "frac_reward_zero_std": 0.0, "grad_norm": 0.6530906969360044, "kl": 0.0005645751953125, "learning_rate": 1.0437936906629334e-07, "loss": 0.0652, "num_tokens": 1391689.0, "reward": 0.84375, "reward_std": 0.5499594211578369, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.98, "frac_reward_zero_std": 1.0, "grad_norm": 0.01164341791205279, "kl": 0.0011444091796875, "learning_rate": 1.0246514708427701e-07, "loss": 0.0, "num_tokens": 1410297.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.25, "rewards/tag_count_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.6801784484817373, "kl": 0.0009212493896484375, "learning_rate": 1.0109617738307911e-07, "loss": 0.0, "num_tokens": 1427513.0, "reward": 0.40625, "reward_std": 0.4419417381286621, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.28125, "rewards/tag_count_reward/std": 0.0883883461356163, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1748.625, "completions/mean_terminated_length": 1449.25, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.7939072364235943, "kl": 0.0008449554443359375, "learning_rate": 1.002741278414069e-07, "loss": 0.1071, "num_tokens": 1442638.0, "reward": 0.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40625, "rewards/tag_count_reward/std": 0.12938730418682098, "step": 100 }, { "epoch": 1.0, "step": 100, "total_flos": 0.0, "train_loss": 0.0464448650211034, "train_runtime": 1230.2242, "train_samples_per_second": 0.081, "train_steps_per_second": 0.081 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 1442638, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }