Files
DeepSeek-R1-Distill-Qwen-1.…/trainer_state.json
ModelHub XC d1d83abf72 初始化项目,由ModelHub XC社区提供模型
Model: yuerxin/DeepSeek-R1-Distill-Qwen-1.5B-GRPO
Source: Original Platform
2026-05-30 02:44:18 +08:00

3044 lines
106 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1627.0,
"completions/mean_length": 1840.125,
"completions/mean_terminated_length": 1493.666748046875,
"completions/min_length": 1381.0,
"completions/min_terminated_length": 1381.0,
"epoch": 0.01,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6628252342045541,
"kl": 0.0004730224609375,
"learning_rate": 0.0,
"loss": 0.0828,
"num_tokens": 15585.0,
"reward": 0.53125,
"reward_std": 0.41052013635635376,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.40625,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1674.0,
"completions/mean_length": 1917.875,
"completions/mean_terminated_length": 1527.5,
"completions/min_length": 1381.0,
"completions/min_terminated_length": 1381.0,
"epoch": 0.02,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7317782492587136,
"kl": 0.00067138671875,
"learning_rate": 1e-07,
"loss": 0.069,
"num_tokens": 32088.0,
"reward": 0.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.375,
"rewards/tag_count_reward/std": 0.13363061845302582,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1641.0,
"completions/mean_length": 1356.875,
"completions/mean_terminated_length": 1126.5,
"completions/min_length": 625.0,
"completions/min_terminated_length": 625.0,
"epoch": 0.03,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9091411567234831,
"kl": 0.0005397796630859375,
"learning_rate": 2e-07,
"loss": 0.039,
"num_tokens": 44431.0,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.46875,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7902192806756456,
"kl": 0.000720977783203125,
"learning_rate": 3e-07,
"loss": 0.0,
"num_tokens": 62215.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1580.0,
"completions/mean_length": 1989.5,
"completions/mean_terminated_length": 1580.0,
"completions/min_length": 1580.0,
"completions/min_terminated_length": 1580.0,
"epoch": 0.05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4606638644327677,
"kl": 0.00045490264892578125,
"learning_rate": 4e-07,
"loss": 0.0382,
"num_tokens": 79059.0,
"reward": 0.5,
"reward_std": 0.4225771427154541,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.375,
"rewards/tag_count_reward/std": 0.13363061845302582,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.06,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0036145388208231963,
"kl": 0.0007953643798828125,
"learning_rate": 5e-07,
"loss": 0.0,
"num_tokens": 96667.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1741.0,
"completions/mean_length": 1504.875,
"completions/mean_terminated_length": 1427.2857666015625,
"completions/min_length": 1116.0,
"completions/min_terminated_length": 1116.0,
"epoch": 0.07,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003359971830136702,
"kl": 0.000370025634765625,
"learning_rate": 6e-07,
"loss": 0.0,
"num_tokens": 110562.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1081.0,
"completions/max_terminated_length": 1081.0,
"completions/mean_length": 833.625,
"completions/mean_terminated_length": 833.625,
"completions/min_length": 510.0,
"completions/min_terminated_length": 510.0,
"epoch": 0.08,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8406033348743903,
"kl": 0.000423431396484375,
"learning_rate": 7e-07,
"loss": 0.0395,
"num_tokens": 118143.0,
"reward": 0.5625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5625,
"rewards/tag_count_reward/std": 0.1767766922712326,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1883.0,
"completions/mean_length": 1935.375,
"completions/mean_terminated_length": 1597.5,
"completions/min_length": 1312.0,
"completions/min_terminated_length": 1312.0,
"epoch": 0.09,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6507369858144503,
"kl": 0.0004940032958984375,
"learning_rate": 8e-07,
"loss": 0.0704,
"num_tokens": 134458.0,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.3125,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1694.0,
"completions/mean_length": 1947.0,
"completions/mean_terminated_length": 1644.0,
"completions/min_length": 1594.0,
"completions/min_terminated_length": 1594.0,
"epoch": 0.1,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9403904262794942,
"kl": 0.000942230224609375,
"learning_rate": 9e-07,
"loss": 0.0548,
"num_tokens": 151226.0,
"reward": 0.375,
"reward_std": 0.26726123690605164,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.375,
"rewards/tag_count_reward/std": 0.26726123690605164,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.11,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003568944942064051,
"kl": 0.0007343292236328125,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 168538.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 819.0,
"completions/max_terminated_length": 819.0,
"completions/mean_length": 745.375,
"completions/mean_terminated_length": 745.375,
"completions/min_length": 604.0,
"completions/min_terminated_length": 604.0,
"epoch": 0.12,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006363067666064702,
"kl": 0.00060272216796875,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0,
"num_tokens": 175093.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.13,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0029190106524072316,
"kl": 0.0005397796630859375,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0,
"num_tokens": 192629.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1842.0,
"completions/mean_length": 1707.875,
"completions/mean_terminated_length": 1594.5,
"completions/min_length": 1003.0,
"completions/min_terminated_length": 1003.0,
"epoch": 0.14,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6809709230610892,
"kl": 0.00041961669921875,
"learning_rate": 9.975348529157229e-07,
"loss": 0.121,
"num_tokens": 207108.0,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.46875,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.15,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0028157580161607284,
"kl": 0.00044918060302734375,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0,
"num_tokens": 224476.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1748.0,
"completions/mean_length": 1521.0,
"completions/mean_terminated_length": 1345.3333740234375,
"completions/min_length": 944.0,
"completions/min_terminated_length": 944.0,
"epoch": 0.16,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0032241372357074316,
"kl": 0.0003681182861328125,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0,
"num_tokens": 237716.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1975.0,
"completions/mean_length": 2038.875,
"completions/mean_terminated_length": 1975.0,
"completions/min_length": 1975.0,
"completions/min_terminated_length": 1975.0,
"epoch": 0.17,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5238026110591891,
"kl": 0.00043487548828125,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0064,
"num_tokens": 254851.0,
"reward": 0.34375,
"reward_std": 0.2651650309562683,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.34375,
"rewards/tag_count_reward/std": 0.2651650309562683,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1946.0,
"completions/mean_length": 1472.875,
"completions/mean_terminated_length": 1390.71435546875,
"completions/min_length": 948.0,
"completions/min_terminated_length": 948.0,
"epoch": 0.18,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6969142375215345,
"kl": 0.000553131103515625,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0285,
"num_tokens": 267802.0,
"reward": 0.5625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5625,
"rewards/tag_count_reward/std": 0.1767766922712326,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.19,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006170385130050486,
"kl": 0.00032806396484375,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0,
"num_tokens": 285002.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1876.0,
"completions/mean_length": 1872.875,
"completions/mean_terminated_length": 1697.75,
"completions/min_length": 1359.0,
"completions/min_terminated_length": 1359.0,
"epoch": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5924705799517846,
"kl": 0.000545501708984375,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0592,
"num_tokens": 301433.0,
"reward": 0.78125,
"reward_std": 0.60411536693573,
"rewards/accuracy_reward/mean": 0.375,
"rewards/accuracy_reward/std": 0.5175492167472839,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.40625,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1238.0,
"completions/max_terminated_length": 1238.0,
"completions/mean_length": 718.75,
"completions/mean_terminated_length": 718.75,
"completions/min_length": 482.0,
"completions/min_terminated_length": 482.0,
"epoch": 0.21,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.076208309541439,
"kl": 0.000446319580078125,
"learning_rate": 9.728616793536587e-07,
"loss": -0.0477,
"num_tokens": 308647.0,
"reward": 0.75,
"reward_std": 0.4629100561141968,
"rewards/accuracy_reward/mean": 0.25,
"rewards/accuracy_reward/std": 0.4629100561141968,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1146.0,
"completions/max_terminated_length": 1146.0,
"completions/mean_length": 751.0,
"completions/mean_terminated_length": 751.0,
"completions/min_length": 247.0,
"completions/min_terminated_length": 247.0,
"epoch": 0.22,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6075613594297673,
"kl": 0.0004673004150390625,
"learning_rate": 9.672327345550543e-07,
"loss": -0.2154,
"num_tokens": 315719.0,
"reward": 0.6875,
"reward_std": 0.5303300619125366,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5625,
"rewards/tag_count_reward/std": 0.1767766922712326,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1330.0,
"completions/mean_length": 1307.125,
"completions/mean_terminated_length": 862.6000366210938,
"completions/min_length": 635.0,
"completions/min_terminated_length": 635.0,
"epoch": 0.23,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8767085399237937,
"kl": 0.0005292892456054688,
"learning_rate": 9.610954559391704e-07,
"loss": 0.2683,
"num_tokens": 327448.0,
"reward": 0.96875,
"reward_std": 0.5737953186035156,
"rewards/accuracy_reward/mean": 0.5,
"rewards/accuracy_reward/std": 0.5345224738121033,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.46875,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.24,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6467284427973272,
"kl": 0.0005121231079101562,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0,
"num_tokens": 344728.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 1745.0,
"completions/mean_terminated_length": 1442.0,
"completions/min_length": 918.0,
"completions/min_terminated_length": 918.0,
"epoch": 0.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6364191810172123,
"kl": 0.00034999847412109375,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0907,
"num_tokens": 359472.0,
"reward": 0.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.375,
"rewards/tag_count_reward/std": 0.13363061845302582,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1109.0,
"completions/max_terminated_length": 1109.0,
"completions/mean_length": 860.125,
"completions/mean_terminated_length": 860.125,
"completions/min_length": 560.0,
"completions/min_terminated_length": 560.0,
"epoch": 0.26,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9557565788184962,
"kl": 0.0004291534423828125,
"learning_rate": 9.397114317029974e-07,
"loss": -0.0065,
"num_tokens": 367057.0,
"reward": 0.625,
"reward_std": 0.3535533845424652,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.27,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003114398061108756,
"kl": 0.0005168914794921875,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0,
"num_tokens": 384721.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 1633.625,
"completions/mean_terminated_length": 1495.5,
"completions/min_length": 832.0,
"completions/min_terminated_length": 832.0,
"epoch": 0.28,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8883130168762549,
"kl": 0.000690460205078125,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0,
"num_tokens": 398990.0,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1646.0,
"completions/mean_length": 1305.875,
"completions/mean_terminated_length": 1058.5,
"completions/min_length": 766.0,
"completions/min_terminated_length": 766.0,
"epoch": 0.29,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8508879198267415,
"kl": 0.00045108795166015625,
"learning_rate": 9.140576474687263e-07,
"loss": 0.2107,
"num_tokens": 410989.0,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 438.0,
"completions/max_terminated_length": 438.0,
"completions/mean_length": 319.875,
"completions/mean_terminated_length": 319.875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.3,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.009396867221837423,
"kl": 0.000469207763671875,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0,
"num_tokens": 414372.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1928.0,
"completions/mean_length": 1646.625,
"completions/mean_terminated_length": 1512.8333740234375,
"completions/min_length": 1172.0,
"completions/min_terminated_length": 1172.0,
"epoch": 0.31,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.677636559478867,
"kl": 0.0004253387451171875,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0789,
"num_tokens": 429153.0,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.32,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.42518375588544877,
"kl": 0.000492095947265625,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0,
"num_tokens": 446505.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1485.0,
"completions/mean_length": 1977.625,
"completions/mean_terminated_length": 1485.0,
"completions/min_length": 1485.0,
"completions/min_terminated_length": 1485.0,
"epoch": 0.33,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7573312728335488,
"kl": 0.00067901611328125,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0563,
"num_tokens": 463614.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.34,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0031934478388464358,
"kl": 0.000537872314453125,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0,
"num_tokens": 481382.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.35,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6214744310551946,
"kl": 0.00048160552978515625,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0,
"num_tokens": 498622.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1532.0,
"completions/max_terminated_length": 1532.0,
"completions/mean_length": 792.625,
"completions/mean_terminated_length": 792.625,
"completions/min_length": 586.0,
"completions/min_terminated_length": 586.0,
"epoch": 0.36,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8848488126874285,
"kl": 0.0005741119384765625,
"learning_rate": 8.392544243589427e-07,
"loss": -0.0344,
"num_tokens": 506091.0,
"reward": 0.9375,
"reward_std": 0.47715675830841064,
"rewards/accuracy_reward/mean": 0.25,
"rewards/accuracy_reward/std": 0.4629100561141968,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.6875,
"rewards/tag_count_reward/std": 0.22160132229328156,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.37,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0037357667583392553,
"kl": 0.000514984130859375,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0,
"num_tokens": 523475.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1437.0,
"completions/mean_length": 1464.875,
"completions/mean_terminated_length": 1115.0,
"completions/min_length": 751.0,
"completions/min_terminated_length": 751.0,
"epoch": 0.38,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9255650045041091,
"kl": 0.0004863739013671875,
"learning_rate": 8.145033635316128e-07,
"loss": 0.242,
"num_tokens": 536562.0,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1930.0,
"completions/mean_length": 1548.25,
"completions/mean_terminated_length": 1476.857177734375,
"completions/min_length": 996.0,
"completions/min_terminated_length": 996.0,
"epoch": 0.39,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7119712141872488,
"kl": 0.0004978179931640625,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0712,
"num_tokens": 550620.0,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.46875,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.4,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0037004750751766843,
"kl": 0.000919342041015625,
"learning_rate": 7.884636689049422e-07,
"loss": 0.0,
"num_tokens": 568516.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2003.0,
"completions/mean_length": 1684.25,
"completions/mean_terminated_length": 1320.5,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"epoch": 0.41,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6765909685421715,
"kl": 0.0006198883056640625,
"learning_rate": 7.75e-07,
"loss": 0.2158,
"num_tokens": 582934.0,
"reward": 0.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.375,
"rewards/tag_count_reward/std": 0.13363061845302582,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1233.0,
"completions/mean_length": 1946.125,
"completions/mean_terminated_length": 1233.0,
"completions/min_length": 1233.0,
"completions/min_terminated_length": 1233.0,
"epoch": 0.42,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6782057995689209,
"kl": 0.000667572021484375,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0878,
"num_tokens": 600151.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.43,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003069913511462463,
"kl": 0.0005235671997070312,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0,
"num_tokens": 617599.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1941.0,
"completions/mean_length": 2034.625,
"completions/mean_terminated_length": 1941.0,
"completions/min_length": 1941.0,
"completions/min_terminated_length": 1941.0,
"epoch": 0.44,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.47430574211354576,
"kl": 0.00039577484130859375,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0073,
"num_tokens": 634780.0,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.3125,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1902.0,
"completions/mean_length": 1884.75,
"completions/mean_terminated_length": 1830.3333740234375,
"completions/min_length": 1705.0,
"completions/min_terminated_length": 1705.0,
"epoch": 0.45,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7613443731007656,
"kl": 0.0006561279296875,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0207,
"num_tokens": 652242.0,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1234.0,
"completions/mean_length": 1147.75,
"completions/mean_terminated_length": 847.6666870117188,
"completions/min_length": 612.0,
"completions/min_terminated_length": 612.0,
"epoch": 0.46,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0778276077011943,
"kl": 0.00066375732421875,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0154,
"num_tokens": 662560.0,
"reward": 0.8125,
"reward_std": 0.5786375403404236,
"rewards/accuracy_reward/mean": 0.375,
"rewards/accuracy_reward/std": 0.5175492167472839,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1327.0,
"completions/max_terminated_length": 1327.0,
"completions/mean_length": 736.25,
"completions/mean_terminated_length": 736.25,
"completions/min_length": 482.0,
"completions/min_terminated_length": 482.0,
"epoch": 0.47,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7259986577953768,
"kl": 0.00115966796875,
"learning_rate": 6.890576474687263e-07,
"loss": -0.0495,
"num_tokens": 669274.0,
"reward": 0.9375,
"reward_std": 0.4172614812850952,
"rewards/accuracy_reward/mean": 0.25,
"rewards/accuracy_reward/std": 0.4629100561141968,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.6875,
"rewards/tag_count_reward/std": 0.25877460837364197,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1397.0,
"completions/max_terminated_length": 1397.0,
"completions/mean_length": 1060.125,
"completions/mean_terminated_length": 1060.125,
"completions/min_length": 685.0,
"completions/min_terminated_length": 685.0,
"epoch": 0.48,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6442195673577423,
"kl": 0.0004329681396484375,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0696,
"num_tokens": 678611.0,
"reward": 0.5625,
"reward_std": 0.1767766922712326,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5625,
"rewards/tag_count_reward/std": 0.1767766922712326,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.49,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006123254776481617,
"kl": 0.00051116943359375,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0,
"num_tokens": 696635.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1466.0,
"completions/mean_length": 1975.25,
"completions/mean_terminated_length": 1466.0,
"completions/min_length": 1466.0,
"completions/min_terminated_length": 1466.0,
"epoch": 0.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6336529820312526,
"kl": 0.0006256103515625,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0585,
"num_tokens": 713709.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 856.0,
"completions/mean_length": 1899.0,
"completions/mean_terminated_length": 856.0,
"completions/min_length": 856.0,
"completions/min_terminated_length": 856.0,
"epoch": 0.51,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.49637328343233095,
"kl": 0.0006580352783203125,
"learning_rate": 6.281416799501187e-07,
"loss": 0.1451,
"num_tokens": 729901.0,
"reward": 0.40625,
"reward_std": 0.4419417381286621,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 2043.625,
"completions/mean_terminated_length": 2013.0,
"completions/min_length": 2013.0,
"completions/min_terminated_length": 2013.0,
"epoch": 0.52,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7195590756820243,
"kl": 0.00067138671875,
"learning_rate": 6.126278954320294e-07,
"loss": 0.0031,
"num_tokens": 747090.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 901.0,
"completions/max_terminated_length": 901.0,
"completions/mean_length": 565.5,
"completions/mean_terminated_length": 565.5,
"completions/min_length": 320.0,
"completions/min_terminated_length": 320.0,
"epoch": 0.53,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.031718782447743454,
"kl": 0.0008029937744140625,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0,
"num_tokens": 752910.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 851.0,
"completions/mean_length": 1898.375,
"completions/mean_terminated_length": 851.0,
"completions/min_length": 851.0,
"completions/min_terminated_length": 851.0,
"epoch": 0.54,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6988079348841579,
"kl": 0.0009441375732421875,
"learning_rate": 5.813904131848564e-07,
"loss": 0.1114,
"num_tokens": 769249.0,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.3125,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 1715.875,
"completions/mean_terminated_length": 1383.75,
"completions/min_length": 749.0,
"completions/min_terminated_length": 749.0,
"epoch": 0.55,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7609580950694678,
"kl": 0.0007781982421875,
"learning_rate": 5.657047735161255e-07,
"loss": 0.1302,
"num_tokens": 783920.0,
"reward": 0.90625,
"reward_std": 0.6399986147880554,
"rewards/accuracy_reward/mean": 0.5,
"rewards/accuracy_reward/std": 0.5345224738121033,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.40625,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 953.0,
"completions/mean_length": 829.625,
"completions/mean_terminated_length": 655.5714721679688,
"completions/min_length": 397.0,
"completions/min_terminated_length": 397.0,
"epoch": 0.56,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1927365377038142,
"kl": 0.001590728759765625,
"learning_rate": 5.5e-07,
"loss": 0.0939,
"num_tokens": 791709.0,
"reward": 0.53125,
"reward_std": 0.2086307406425476,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.53125,
"rewards/tag_count_reward/std": 0.2086307406425476,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1331.0,
"completions/mean_length": 994.5,
"completions/mean_terminated_length": 844.0000610351562,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.57,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.182121142240305,
"kl": 0.0005893707275390625,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0,
"num_tokens": 801057.0,
"reward": 0.625,
"reward_std": 0.2314550280570984,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.625,
"rewards/tag_count_reward/std": 0.2314550280570984,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1057.0,
"completions/max_terminated_length": 1057.0,
"completions/mean_length": 834.25,
"completions/mean_terminated_length": 834.25,
"completions/min_length": 659.0,
"completions/min_terminated_length": 659.0,
"epoch": 0.58,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8842736658535645,
"kl": 0.0005855560302734375,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0177,
"num_tokens": 809131.0,
"reward": 0.875,
"reward_std": 0.5175491571426392,
"rewards/accuracy_reward/mean": 0.375,
"rewards/accuracy_reward/std": 0.5175492167472839,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1438.0,
"completions/max_terminated_length": 1438.0,
"completions/mean_length": 580.25,
"completions/mean_terminated_length": 580.25,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.59,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.947212959421752,
"kl": 0.000576019287109375,
"learning_rate": 5.02962191529556e-07,
"loss": 0.1389,
"num_tokens": 814829.0,
"reward": 0.9375,
"reward_std": 0.45806270837783813,
"rewards/accuracy_reward/mean": 0.25,
"rewards/accuracy_reward/std": 0.4629100561141968,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.6875,
"rewards/tag_count_reward/std": 0.1767766922712326,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 1823.875,
"completions/mean_terminated_length": 1450.3333740234375,
"completions/min_length": 1205.0,
"completions/min_terminated_length": 1205.0,
"epoch": 0.6,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7296721993472337,
"kl": 0.0008335113525390625,
"learning_rate": 4.873721045679706e-07,
"loss": 0.1096,
"num_tokens": 831916.0,
"reward": 0.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.375,
"rewards/tag_count_reward/std": 0.13363061845302582,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1916.0,
"completions/mean_length": 1570.0,
"completions/mean_terminated_length": 1410.666748046875,
"completions/min_length": 784.0,
"completions/min_terminated_length": 784.0,
"epoch": 0.61,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6173214568936407,
"kl": 0.000675201416015625,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0118,
"num_tokens": 845372.0,
"reward": 0.46875,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.46875,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 1746.5,
"completions/mean_terminated_length": 1445.0,
"completions/min_length": 1093.0,
"completions/min_terminated_length": 1093.0,
"epoch": 0.62,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6996417102286492,
"kl": 0.0004863739013671875,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0543,
"num_tokens": 860464.0,
"reward": 0.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.375,
"rewards/tag_count_reward/std": 0.13363061845302582,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1941.0,
"completions/mean_length": 1849.75,
"completions/mean_terminated_length": 1651.5,
"completions/min_length": 1406.0,
"completions/min_terminated_length": 1406.0,
"epoch": 0.63,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5518151553173074,
"kl": 0.00035190582275390625,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0645,
"num_tokens": 876246.0,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1935.0,
"completions/mean_length": 1869.0,
"completions/mean_terminated_length": 1332.0,
"completions/min_length": 729.0,
"completions/min_terminated_length": 729.0,
"epoch": 0.64,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7947269366057148,
"kl": 0.00045108795166015625,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.1176,
"num_tokens": 892662.0,
"reward": 0.375,
"reward_std": 0.13363061845302582,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.375,
"rewards/tag_count_reward/std": 0.13363061845302582,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1946.0,
"completions/mean_length": 1797.0,
"completions/mean_terminated_length": 1378.666748046875,
"completions/min_length": 662.0,
"completions/min_terminated_length": 662.0,
"epoch": 0.65,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7873341627366034,
"kl": 0.001064300537109375,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.1647,
"num_tokens": 908198.0,
"reward": 0.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.34375,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1401.0,
"completions/mean_length": 1967.125,
"completions/mean_terminated_length": 1401.0,
"completions/min_length": 1401.0,
"completions/min_terminated_length": 1401.0,
"epoch": 0.66,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6108272802669615,
"kl": 0.0005245208740234375,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0663,
"num_tokens": 924743.0,
"reward": 0.40625,
"reward_std": 0.4419417381286621,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 2007.0,
"completions/max_terminated_length": 2007.0,
"completions/mean_length": 1388.0,
"completions/mean_terminated_length": 1388.0,
"completions/min_length": 1120.0,
"completions/min_terminated_length": 1120.0,
"epoch": 0.67,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6923989772553029,
"kl": 0.0004787445068359375,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0752,
"num_tokens": 937455.0,
"reward": 0.625,
"reward_std": 0.2314550280570984,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.625,
"rewards/tag_count_reward/std": 0.2314550280570984,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.68,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5804438368543029,
"kl": 0.0006103515625,
"learning_rate": 3.6696851061588994e-07,
"loss": 0.0,
"num_tokens": 957231.0,
"reward": 0.5625,
"reward_std": 0.5786375403404236,
"rewards/accuracy_reward/mean": 0.25,
"rewards/accuracy_reward/std": 0.4629100561141968,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.3125,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.69,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.602172715608051,
"kl": 0.0004863739013671875,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0,
"num_tokens": 974775.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1173.0,
"completions/mean_length": 1111.75,
"completions/mean_terminated_length": 978.0000610351562,
"completions/min_length": 717.0,
"completions/min_terminated_length": 717.0,
"epoch": 0.7,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5800710876181367,
"kl": 0.0004787445068359375,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0362,
"num_tokens": 985317.0,
"reward": 0.59375,
"reward_std": 0.376485139131546,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.46875,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1977.0,
"completions/mean_length": 1826.75,
"completions/mean_terminated_length": 1605.5,
"completions/min_length": 892.0,
"completions/min_terminated_length": 892.0,
"epoch": 0.71,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8588425272525544,
"kl": 0.001007080078125,
"learning_rate": 3.250000000000001e-07,
"loss": 0.025,
"num_tokens": 1001051.0,
"reward": 0.5,
"reward_std": 0.4225771427154541,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.375,
"rewards/tag_count_reward/std": 0.13363061845302582,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1592.0,
"completions/mean_length": 1284.0,
"completions/mean_terminated_length": 1174.857177734375,
"completions/min_length": 953.0,
"completions/min_terminated_length": 953.0,
"epoch": 0.72,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.013235371533423931,
"kl": 0.0008678436279296875,
"learning_rate": 3.115363310950578e-07,
"loss": 0.0,
"num_tokens": 1012123.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.0,
"completions/mean_length": 1916.25,
"completions/mean_terminated_length": 1696.666748046875,
"completions/min_length": 1519.0,
"completions/min_terminated_length": 1519.0,
"epoch": 0.73,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5474968619673198,
"kl": 0.000751495361328125,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0,
"num_tokens": 1028421.0,
"reward": 0.65625,
"reward_std": 0.5334774851799011,
"rewards/accuracy_reward/mean": 0.25,
"rewards/accuracy_reward/std": 0.4629100561141968,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.40625,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1883.0,
"completions/max_terminated_length": 1883.0,
"completions/mean_length": 1288.75,
"completions/mean_terminated_length": 1288.75,
"completions/min_length": 764.0,
"completions/min_terminated_length": 764.0,
"epoch": 0.74,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.003456095198800055,
"kl": 0.0003814697265625,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0,
"num_tokens": 1039587.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1668.0,
"completions/mean_length": 1781.0,
"completions/mean_terminated_length": 1514.0,
"completions/min_length": 1256.0,
"completions/min_terminated_length": 1256.0,
"epoch": 0.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.548319750098783,
"kl": 0.00042057037353515625,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0938,
"num_tokens": 1054939.0,
"reward": 0.9375,
"reward_std": 0.6087164282798767,
"rewards/accuracy_reward/mean": 0.5,
"rewards/accuracy_reward/std": 0.5345224738121033,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.76,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0028496725166757297,
"kl": 0.00051116943359375,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0,
"num_tokens": 1072235.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1943.0,
"completions/mean_length": 1953.375,
"completions/mean_terminated_length": 1669.5,
"completions/min_length": 1396.0,
"completions/min_terminated_length": 1396.0,
"epoch": 0.77,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.706431481153136,
"kl": 0.000545501708984375,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0582,
"num_tokens": 1089334.0,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.3125,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1759.0,
"completions/mean_length": 1863.25,
"completions/mean_terminated_length": 1555.3333740234375,
"completions/min_length": 1322.0,
"completions/min_terminated_length": 1322.0,
"epoch": 0.78,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7026848252787423,
"kl": 0.0008983612060546875,
"learning_rate": 2.374037332934512e-07,
"loss": 0.1001,
"num_tokens": 1105592.0,
"reward": 0.40625,
"reward_std": 0.2651650309562683,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.40625,
"rewards/tag_count_reward/std": 0.2651650309562683,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1333.0,
"completions/max_terminated_length": 1333.0,
"completions/mean_length": 1021.625,
"completions/mean_terminated_length": 1021.625,
"completions/min_length": 679.0,
"completions/min_terminated_length": 679.0,
"epoch": 0.79,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004479904510744182,
"kl": 0.0004482269287109375,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0,
"num_tokens": 1114469.0,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5,
"rewards/tag_count_reward/std": 0.0,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.8,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.012713748507087811,
"kl": 0.000774383544921875,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0,
"num_tokens": 1132061.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.81,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.009237916379458392,
"kl": 0.00116729736328125,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0,
"num_tokens": 1149405.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.82,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.731282287844213,
"kl": 0.00102996826171875,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.0,
"num_tokens": 1167485.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1497.0,
"completions/mean_length": 1806.375,
"completions/mean_terminated_length": 1403.666748046875,
"completions/min_length": 1279.0,
"completions/min_terminated_length": 1279.0,
"epoch": 0.83,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6696391690257025,
"kl": 0.0007190704345703125,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.1358,
"num_tokens": 1182888.0,
"reward": 0.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.34375,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1856.375,
"completions/mean_terminated_length": 1537.0,
"completions/min_length": 1284.0,
"completions/min_terminated_length": 1284.0,
"epoch": 0.84,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8520445078964867,
"kl": 0.00098419189453125,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.1106,
"num_tokens": 1198539.0,
"reward": 0.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.34375,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1619.0,
"completions/mean_length": 1552.75,
"completions/mean_terminated_length": 1387.666748046875,
"completions/min_length": 1177.0,
"completions/min_terminated_length": 1177.0,
"epoch": 0.85,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6415780475709305,
"kl": 0.00045013427734375,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.1278,
"num_tokens": 1211897.0,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1301.75,
"completions/mean_terminated_length": 854.0,
"completions/min_length": 401.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.86,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2846817650924338,
"kl": 0.0011138916015625,
"learning_rate": 1.6028856829700258e-07,
"loss": 0.3474,
"num_tokens": 1223335.0,
"reward": 0.4375,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.4375,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1927.0,
"completions/mean_length": 1210.625,
"completions/mean_terminated_length": 1091.0,
"completions/min_length": 543.0,
"completions/min_terminated_length": 543.0,
"epoch": 0.87,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7215403346916941,
"kl": 0.0008296966552734375,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0492,
"num_tokens": 1233980.0,
"reward": 0.53125,
"reward_std": 0.2086307406425476,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.53125,
"rewards/tag_count_reward/std": 0.2086307406425476,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1000.0,
"completions/mean_length": 1143.375,
"completions/mean_terminated_length": 841.8333740234375,
"completions/min_length": 740.0,
"completions/min_terminated_length": 740.0,
"epoch": 0.88,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.888295640678355,
"kl": 0.0009326934814453125,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.203,
"num_tokens": 1244423.0,
"reward": 0.5625,
"reward_std": 0.2912411689758301,
"rewards/accuracy_reward/mean": NaN,
"rewards/accuracy_reward/std": NaN,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.5625,
"rewards/tag_count_reward/std": 0.29124119877815247,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -1.0,
"completions/max_length": 1903.0,
"completions/max_terminated_length": 1903.0,
"completions/mean_length": 1234.0,
"completions/mean_terminated_length": 1234.0,
"completions/min_length": 955.0,
"completions/min_terminated_length": 955.0,
"epoch": 0.89,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4757099764606759,
"kl": 0.000911712646484375,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0644,
"num_tokens": 1255343.0,
"reward": 0.53125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.53125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1968.0,
"completions/mean_length": 2009.625,
"completions/mean_terminated_length": 1894.5,
"completions/min_length": 1821.0,
"completions/min_terminated_length": 1821.0,
"epoch": 0.9,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7840957699122901,
"kl": 0.00072479248046875,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0212,
"num_tokens": 1272652.0,
"reward": 0.3125,
"reward_std": 0.1157275140285492,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.3125,
"rewards/tag_count_reward/std": 0.1157275140285492,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1859.0,
"completions/mean_length": 1858.25,
"completions/mean_terminated_length": 1542.0,
"completions/min_length": 1311.0,
"completions/min_terminated_length": 1311.0,
"epoch": 0.91,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8165269333115026,
"kl": 0.00063323974609375,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0456,
"num_tokens": 1289190.0,
"reward": 0.46875,
"reward_std": 0.4317220449447632,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.34375,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.92,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6822113621589488,
"kl": 0.000732421875,
"learning_rate": 1.220245676671809e-07,
"loss": 0.0,
"num_tokens": 1307070.0,
"reward": 0.28125,
"reward_std": 0.0883883461356163,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1638.0,
"completions/mean_length": 1996.75,
"completions/mean_terminated_length": 1638.0,
"completions/min_length": 1638.0,
"completions/min_terminated_length": 1638.0,
"epoch": 0.93,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5735308409410357,
"kl": 0.000598907470703125,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0,
"num_tokens": 1324564.0,
"reward": 0.34375,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.34375,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.94,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0035277658572437772,
"kl": 0.00074005126953125,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0,
"num_tokens": 1342244.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.95,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0033309614626830077,
"kl": 0.0007114410400390625,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0,
"num_tokens": 1360532.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.96,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.009616790202442917,
"kl": 0.0008449554443359375,
"learning_rate": 1.068365111445064e-07,
"loss": 0.0,
"num_tokens": 1377964.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": -0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1899.0,
"completions/mean_length": 1599.625,
"completions/mean_terminated_length": 1450.166748046875,
"completions/min_length": 1096.0,
"completions/min_terminated_length": 1096.0,
"epoch": 0.97,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6530906969360044,
"kl": 0.0005645751953125,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0652,
"num_tokens": 1391689.0,
"reward": 0.84375,
"reward_std": 0.5499594211578369,
"rewards/accuracy_reward/mean": 0.375,
"rewards/accuracy_reward/std": 0.5175492167472839,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.46875,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.98,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.01164341791205279,
"kl": 0.0011444091796875,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0,
"num_tokens": 1410297.0,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.25,
"rewards/tag_count_reward/std": 0.0,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.99,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6801784484817373,
"kl": 0.0009212493896484375,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0,
"num_tokens": 1427513.0,
"reward": 0.40625,
"reward_std": 0.4419417381286621,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.3535533845424652,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.28125,
"rewards/tag_count_reward/std": 0.0883883461356163,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1996.0,
"completions/mean_length": 1748.625,
"completions/mean_terminated_length": 1449.25,
"completions/min_length": 915.0,
"completions/min_terminated_length": 915.0,
"epoch": 1.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.7939072364235943,
"kl": 0.0008449554443359375,
"learning_rate": 1.002741278414069e-07,
"loss": 0.1071,
"num_tokens": 1442638.0,
"reward": 0.40625,
"reward_std": 0.1293872892856598,
"rewards/accuracy_reward/mean": 0.0,
"rewards/accuracy_reward/std": 0.0,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"rewards/tag_count_reward/mean": 0.40625,
"rewards/tag_count_reward/std": 0.12938730418682098,
"step": 100
},
{
"epoch": 1.0,
"step": 100,
"total_flos": 0.0,
"train_loss": 0.0464448650211034,
"train_runtime": 1230.2242,
"train_samples_per_second": 0.081,
"train_steps_per_second": 0.081
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 1442638,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}