3044 lines
106 KiB
JSON
3044 lines
106 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 500,
|
|
"global_step": 100,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1627.0,
|
|
"completions/mean_length": 1840.125,
|
|
"completions/mean_terminated_length": 1493.666748046875,
|
|
"completions/min_length": 1381.0,
|
|
"completions/min_terminated_length": 1381.0,
|
|
"epoch": 0.01,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6628252342045541,
|
|
"kl": 0.0004730224609375,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.0828,
|
|
"num_tokens": 15585.0,
|
|
"reward": 0.53125,
|
|
"reward_std": 0.41052013635635376,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.40625,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1674.0,
|
|
"completions/mean_length": 1917.875,
|
|
"completions/mean_terminated_length": 1527.5,
|
|
"completions/min_length": 1381.0,
|
|
"completions/min_terminated_length": 1381.0,
|
|
"epoch": 0.02,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7317782492587136,
|
|
"kl": 0.00067138671875,
|
|
"learning_rate": 1e-07,
|
|
"loss": 0.069,
|
|
"num_tokens": 32088.0,
|
|
"reward": 0.375,
|
|
"reward_std": 0.13363061845302582,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.375,
|
|
"rewards/tag_count_reward/std": 0.13363061845302582,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1641.0,
|
|
"completions/mean_length": 1356.875,
|
|
"completions/mean_terminated_length": 1126.5,
|
|
"completions/min_length": 625.0,
|
|
"completions/min_terminated_length": 625.0,
|
|
"epoch": 0.03,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9091411567234831,
|
|
"kl": 0.0005397796630859375,
|
|
"learning_rate": 2e-07,
|
|
"loss": 0.039,
|
|
"num_tokens": 44431.0,
|
|
"reward": 0.46875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.46875,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.04,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7902192806756456,
|
|
"kl": 0.000720977783203125,
|
|
"learning_rate": 3e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 62215.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1580.0,
|
|
"completions/mean_length": 1989.5,
|
|
"completions/mean_terminated_length": 1580.0,
|
|
"completions/min_length": 1580.0,
|
|
"completions/min_terminated_length": 1580.0,
|
|
"epoch": 0.05,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.4606638644327677,
|
|
"kl": 0.00045490264892578125,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.0382,
|
|
"num_tokens": 79059.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.4225771427154541,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.375,
|
|
"rewards/tag_count_reward/std": 0.13363061845302582,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.06,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0036145388208231963,
|
|
"kl": 0.0007953643798828125,
|
|
"learning_rate": 5e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 96667.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1741.0,
|
|
"completions/mean_length": 1504.875,
|
|
"completions/mean_terminated_length": 1427.2857666015625,
|
|
"completions/min_length": 1116.0,
|
|
"completions/min_terminated_length": 1116.0,
|
|
"epoch": 0.07,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.003359971830136702,
|
|
"kl": 0.000370025634765625,
|
|
"learning_rate": 6e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 110562.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1081.0,
|
|
"completions/max_terminated_length": 1081.0,
|
|
"completions/mean_length": 833.625,
|
|
"completions/mean_terminated_length": 833.625,
|
|
"completions/min_length": 510.0,
|
|
"completions/min_terminated_length": 510.0,
|
|
"epoch": 0.08,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8406033348743903,
|
|
"kl": 0.000423431396484375,
|
|
"learning_rate": 7e-07,
|
|
"loss": 0.0395,
|
|
"num_tokens": 118143.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.1767766922712326,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5625,
|
|
"rewards/tag_count_reward/std": 0.1767766922712326,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1883.0,
|
|
"completions/mean_length": 1935.375,
|
|
"completions/mean_terminated_length": 1597.5,
|
|
"completions/min_length": 1312.0,
|
|
"completions/min_terminated_length": 1312.0,
|
|
"epoch": 0.09,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6507369858144503,
|
|
"kl": 0.0004940032958984375,
|
|
"learning_rate": 8e-07,
|
|
"loss": 0.0704,
|
|
"num_tokens": 134458.0,
|
|
"reward": 0.3125,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.3125,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1694.0,
|
|
"completions/mean_length": 1947.0,
|
|
"completions/mean_terminated_length": 1644.0,
|
|
"completions/min_length": 1594.0,
|
|
"completions/min_terminated_length": 1594.0,
|
|
"epoch": 0.1,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9403904262794942,
|
|
"kl": 0.000942230224609375,
|
|
"learning_rate": 9e-07,
|
|
"loss": 0.0548,
|
|
"num_tokens": 151226.0,
|
|
"reward": 0.375,
|
|
"reward_std": 0.26726123690605164,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.375,
|
|
"rewards/tag_count_reward/std": 0.26726123690605164,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.11,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.003568944942064051,
|
|
"kl": 0.0007343292236328125,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0,
|
|
"num_tokens": 168538.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 819.0,
|
|
"completions/max_terminated_length": 819.0,
|
|
"completions/mean_length": 745.375,
|
|
"completions/mean_terminated_length": 745.375,
|
|
"completions/min_length": 604.0,
|
|
"completions/min_terminated_length": 604.0,
|
|
"epoch": 0.12,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.006363067666064702,
|
|
"kl": 0.00060272216796875,
|
|
"learning_rate": 9.997258721585931e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 175093.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.13,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0029190106524072316,
|
|
"kl": 0.0005397796630859375,
|
|
"learning_rate": 9.989038226169207e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 192629.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1842.0,
|
|
"completions/mean_length": 1707.875,
|
|
"completions/mean_terminated_length": 1594.5,
|
|
"completions/min_length": 1003.0,
|
|
"completions/min_terminated_length": 1003.0,
|
|
"epoch": 0.14,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6809709230610892,
|
|
"kl": 0.00041961669921875,
|
|
"learning_rate": 9.975348529157229e-07,
|
|
"loss": 0.121,
|
|
"num_tokens": 207108.0,
|
|
"reward": 0.46875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.46875,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.15,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0028157580161607284,
|
|
"kl": 0.00044918060302734375,
|
|
"learning_rate": 9.956206309337066e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 224476.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1748.0,
|
|
"completions/mean_length": 1521.0,
|
|
"completions/mean_terminated_length": 1345.3333740234375,
|
|
"completions/min_length": 944.0,
|
|
"completions/min_terminated_length": 944.0,
|
|
"epoch": 0.16,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0032241372357074316,
|
|
"kl": 0.0003681182861328125,
|
|
"learning_rate": 9.931634888554935e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 237716.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1975.0,
|
|
"completions/mean_length": 2038.875,
|
|
"completions/mean_terminated_length": 1975.0,
|
|
"completions/min_length": 1975.0,
|
|
"completions/min_terminated_length": 1975.0,
|
|
"epoch": 0.17,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.5238026110591891,
|
|
"kl": 0.00043487548828125,
|
|
"learning_rate": 9.901664203302124e-07,
|
|
"loss": 0.0064,
|
|
"num_tokens": 254851.0,
|
|
"reward": 0.34375,
|
|
"reward_std": 0.2651650309562683,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.34375,
|
|
"rewards/tag_count_reward/std": 0.2651650309562683,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1946.0,
|
|
"completions/mean_length": 1472.875,
|
|
"completions/mean_terminated_length": 1390.71435546875,
|
|
"completions/min_length": 948.0,
|
|
"completions/min_terminated_length": 948.0,
|
|
"epoch": 0.18,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6969142375215345,
|
|
"kl": 0.000553131103515625,
|
|
"learning_rate": 9.866330768241983e-07,
|
|
"loss": 0.0285,
|
|
"num_tokens": 267802.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.1767766922712326,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5625,
|
|
"rewards/tag_count_reward/std": 0.1767766922712326,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.19,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.006170385130050486,
|
|
"kl": 0.00032806396484375,
|
|
"learning_rate": 9.825677631722435e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 285002.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1876.0,
|
|
"completions/mean_length": 1872.875,
|
|
"completions/mean_terminated_length": 1697.75,
|
|
"completions/min_length": 1359.0,
|
|
"completions/min_terminated_length": 1359.0,
|
|
"epoch": 0.2,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.5924705799517846,
|
|
"kl": 0.000545501708984375,
|
|
"learning_rate": 9.779754323328192e-07,
|
|
"loss": 0.0592,
|
|
"num_tokens": 301433.0,
|
|
"reward": 0.78125,
|
|
"reward_std": 0.60411536693573,
|
|
"rewards/accuracy_reward/mean": 0.375,
|
|
"rewards/accuracy_reward/std": 0.5175492167472839,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.40625,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1238.0,
|
|
"completions/max_terminated_length": 1238.0,
|
|
"completions/mean_length": 718.75,
|
|
"completions/mean_terminated_length": 718.75,
|
|
"completions/min_length": 482.0,
|
|
"completions/min_terminated_length": 482.0,
|
|
"epoch": 0.21,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.076208309541439,
|
|
"kl": 0.000446319580078125,
|
|
"learning_rate": 9.728616793536587e-07,
|
|
"loss": -0.0477,
|
|
"num_tokens": 308647.0,
|
|
"reward": 0.75,
|
|
"reward_std": 0.4629100561141968,
|
|
"rewards/accuracy_reward/mean": 0.25,
|
|
"rewards/accuracy_reward/std": 0.4629100561141968,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1146.0,
|
|
"completions/max_terminated_length": 1146.0,
|
|
"completions/mean_length": 751.0,
|
|
"completions/mean_terminated_length": 751.0,
|
|
"completions/min_length": 247.0,
|
|
"completions/min_terminated_length": 247.0,
|
|
"epoch": 0.22,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.6075613594297673,
|
|
"kl": 0.0004673004150390625,
|
|
"learning_rate": 9.672327345550543e-07,
|
|
"loss": -0.2154,
|
|
"num_tokens": 315719.0,
|
|
"reward": 0.6875,
|
|
"reward_std": 0.5303300619125366,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5625,
|
|
"rewards/tag_count_reward/std": 0.1767766922712326,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1330.0,
|
|
"completions/mean_length": 1307.125,
|
|
"completions/mean_terminated_length": 862.6000366210938,
|
|
"completions/min_length": 635.0,
|
|
"completions/min_terminated_length": 635.0,
|
|
"epoch": 0.23,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8767085399237937,
|
|
"kl": 0.0005292892456054688,
|
|
"learning_rate": 9.610954559391704e-07,
|
|
"loss": 0.2683,
|
|
"num_tokens": 327448.0,
|
|
"reward": 0.96875,
|
|
"reward_std": 0.5737953186035156,
|
|
"rewards/accuracy_reward/mean": 0.5,
|
|
"rewards/accuracy_reward/std": 0.5345224738121033,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.46875,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.24,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6467284427973272,
|
|
"kl": 0.0005121231079101562,
|
|
"learning_rate": 9.54457320834625e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 344728.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 2020.0,
|
|
"completions/mean_length": 1745.0,
|
|
"completions/mean_terminated_length": 1442.0,
|
|
"completions/min_length": 918.0,
|
|
"completions/min_terminated_length": 918.0,
|
|
"epoch": 0.25,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6364191810172123,
|
|
"kl": 0.00034999847412109375,
|
|
"learning_rate": 9.473264167865171e-07,
|
|
"loss": 0.0907,
|
|
"num_tokens": 359472.0,
|
|
"reward": 0.375,
|
|
"reward_std": 0.13363061845302582,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.375,
|
|
"rewards/tag_count_reward/std": 0.13363061845302582,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1109.0,
|
|
"completions/max_terminated_length": 1109.0,
|
|
"completions/mean_length": 860.125,
|
|
"completions/mean_terminated_length": 860.125,
|
|
"completions/min_length": 560.0,
|
|
"completions/min_terminated_length": 560.0,
|
|
"epoch": 0.26,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9557565788184962,
|
|
"kl": 0.0004291534423828125,
|
|
"learning_rate": 9.397114317029974e-07,
|
|
"loss": -0.0065,
|
|
"num_tokens": 367057.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.3535533845424652,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.27,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.003114398061108756,
|
|
"kl": 0.0005168914794921875,
|
|
"learning_rate": 9.316216432703916e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 384721.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 2047.0,
|
|
"completions/mean_length": 1633.625,
|
|
"completions/mean_terminated_length": 1495.5,
|
|
"completions/min_length": 832.0,
|
|
"completions/min_terminated_length": 832.0,
|
|
"epoch": 0.28,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8883130168762549,
|
|
"kl": 0.000690460205078125,
|
|
"learning_rate": 9.230669076497687e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 398990.0,
|
|
"reward": 0.4375,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1646.0,
|
|
"completions/mean_length": 1305.875,
|
|
"completions/mean_terminated_length": 1058.5,
|
|
"completions/min_length": 766.0,
|
|
"completions/min_terminated_length": 766.0,
|
|
"epoch": 0.29,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8508879198267415,
|
|
"kl": 0.00045108795166015625,
|
|
"learning_rate": 9.140576474687263e-07,
|
|
"loss": 0.2107,
|
|
"num_tokens": 410989.0,
|
|
"reward": 0.4375,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 438.0,
|
|
"completions/max_terminated_length": 438.0,
|
|
"completions/mean_length": 319.875,
|
|
"completions/mean_terminated_length": 319.875,
|
|
"completions/min_length": 6.0,
|
|
"completions/min_terminated_length": 6.0,
|
|
"epoch": 0.3,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.009396867221837423,
|
|
"kl": 0.000469207763671875,
|
|
"learning_rate": 9.046048391230247e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 414372.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1928.0,
|
|
"completions/mean_length": 1646.625,
|
|
"completions/mean_terminated_length": 1512.8333740234375,
|
|
"completions/min_length": 1172.0,
|
|
"completions/min_terminated_length": 1172.0,
|
|
"epoch": 0.31,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.677636559478867,
|
|
"kl": 0.0004253387451171875,
|
|
"learning_rate": 8.9471999940354e-07,
|
|
"loss": 0.0789,
|
|
"num_tokens": 429153.0,
|
|
"reward": 0.4375,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.32,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.42518375588544877,
|
|
"kl": 0.000492095947265625,
|
|
"learning_rate": 8.844151714648274e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 446505.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1485.0,
|
|
"completions/mean_length": 1977.625,
|
|
"completions/mean_terminated_length": 1485.0,
|
|
"completions/min_length": 1485.0,
|
|
"completions/min_terminated_length": 1485.0,
|
|
"epoch": 0.33,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7573312728335488,
|
|
"kl": 0.00067901611328125,
|
|
"learning_rate": 8.737029101523929e-07,
|
|
"loss": 0.0563,
|
|
"num_tokens": 463614.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.34,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0031934478388464358,
|
|
"kl": 0.000537872314453125,
|
|
"learning_rate": 8.625962667065487e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 481382.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.35,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6214744310551946,
|
|
"kl": 0.00048160552978515625,
|
|
"learning_rate": 8.511087728614862e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 498622.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1532.0,
|
|
"completions/max_terminated_length": 1532.0,
|
|
"completions/mean_length": 792.625,
|
|
"completions/mean_terminated_length": 792.625,
|
|
"completions/min_length": 586.0,
|
|
"completions/min_terminated_length": 586.0,
|
|
"epoch": 0.36,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8848488126874285,
|
|
"kl": 0.0005741119384765625,
|
|
"learning_rate": 8.392544243589427e-07,
|
|
"loss": -0.0344,
|
|
"num_tokens": 506091.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.47715675830841064,
|
|
"rewards/accuracy_reward/mean": 0.25,
|
|
"rewards/accuracy_reward/std": 0.4629100561141968,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.6875,
|
|
"rewards/tag_count_reward/std": 0.22160132229328156,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.37,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0037357667583392553,
|
|
"kl": 0.000514984130859375,
|
|
"learning_rate": 8.270476638965461e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 523475.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1437.0,
|
|
"completions/mean_length": 1464.875,
|
|
"completions/mean_terminated_length": 1115.0,
|
|
"completions/min_length": 751.0,
|
|
"completions/min_terminated_length": 751.0,
|
|
"epoch": 0.38,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.9255650045041091,
|
|
"kl": 0.0004863739013671875,
|
|
"learning_rate": 8.145033635316128e-07,
|
|
"loss": 0.242,
|
|
"num_tokens": 536562.0,
|
|
"reward": 0.4375,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1930.0,
|
|
"completions/mean_length": 1548.25,
|
|
"completions/mean_terminated_length": 1476.857177734375,
|
|
"completions/min_length": 996.0,
|
|
"completions/min_terminated_length": 996.0,
|
|
"epoch": 0.39,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7119712141872488,
|
|
"kl": 0.0004978179931640625,
|
|
"learning_rate": 8.01636806561836e-07,
|
|
"loss": 0.0712,
|
|
"num_tokens": 550620.0,
|
|
"reward": 0.46875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.46875,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.4,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0037004750751766843,
|
|
"kl": 0.000919342041015625,
|
|
"learning_rate": 7.884636689049422e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 568516.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 2003.0,
|
|
"completions/mean_length": 1684.25,
|
|
"completions/mean_terminated_length": 1320.5,
|
|
"completions/min_length": 1052.0,
|
|
"completions/min_terminated_length": 1052.0,
|
|
"epoch": 0.41,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6765909685421715,
|
|
"kl": 0.0006198883056640625,
|
|
"learning_rate": 7.75e-07,
|
|
"loss": 0.2158,
|
|
"num_tokens": 582934.0,
|
|
"reward": 0.375,
|
|
"reward_std": 0.13363061845302582,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.375,
|
|
"rewards/tag_count_reward/std": 0.13363061845302582,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1233.0,
|
|
"completions/mean_length": 1946.125,
|
|
"completions/mean_terminated_length": 1233.0,
|
|
"completions/min_length": 1233.0,
|
|
"completions/min_terminated_length": 1233.0,
|
|
"epoch": 0.42,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6782057995689209,
|
|
"kl": 0.000667572021484375,
|
|
"learning_rate": 7.612622032536507e-07,
|
|
"loss": 0.0878,
|
|
"num_tokens": 600151.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.43,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.003069913511462463,
|
|
"kl": 0.0005235671997070312,
|
|
"learning_rate": 7.472670160550848e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 617599.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1941.0,
|
|
"completions/mean_length": 2034.625,
|
|
"completions/mean_terminated_length": 1941.0,
|
|
"completions/min_length": 1941.0,
|
|
"completions/min_terminated_length": 1941.0,
|
|
"epoch": 0.44,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.47430574211354576,
|
|
"kl": 0.00039577484130859375,
|
|
"learning_rate": 7.330314893841101e-07,
|
|
"loss": 0.0073,
|
|
"num_tokens": 634780.0,
|
|
"reward": 0.3125,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.3125,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1902.0,
|
|
"completions/mean_length": 1884.75,
|
|
"completions/mean_terminated_length": 1830.3333740234375,
|
|
"completions/min_length": 1705.0,
|
|
"completions/min_terminated_length": 1705.0,
|
|
"epoch": 0.45,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7613443731007656,
|
|
"kl": 0.0006561279296875,
|
|
"learning_rate": 7.185729670371604e-07,
|
|
"loss": 0.0207,
|
|
"num_tokens": 652242.0,
|
|
"reward": 0.4375,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1234.0,
|
|
"completions/mean_length": 1147.75,
|
|
"completions/mean_terminated_length": 847.6666870117188,
|
|
"completions/min_length": 612.0,
|
|
"completions/min_terminated_length": 612.0,
|
|
"epoch": 0.46,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.0778276077011943,
|
|
"kl": 0.00066375732421875,
|
|
"learning_rate": 7.039090644965509e-07,
|
|
"loss": 0.0154,
|
|
"num_tokens": 662560.0,
|
|
"reward": 0.8125,
|
|
"reward_std": 0.5786375403404236,
|
|
"rewards/accuracy_reward/mean": 0.375,
|
|
"rewards/accuracy_reward/std": 0.5175492167472839,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1327.0,
|
|
"completions/max_terminated_length": 1327.0,
|
|
"completions/mean_length": 736.25,
|
|
"completions/mean_terminated_length": 736.25,
|
|
"completions/min_length": 482.0,
|
|
"completions/min_terminated_length": 482.0,
|
|
"epoch": 0.47,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.7259986577953768,
|
|
"kl": 0.00115966796875,
|
|
"learning_rate": 6.890576474687263e-07,
|
|
"loss": -0.0495,
|
|
"num_tokens": 669274.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.4172614812850952,
|
|
"rewards/accuracy_reward/mean": 0.25,
|
|
"rewards/accuracy_reward/std": 0.4629100561141968,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.6875,
|
|
"rewards/tag_count_reward/std": 0.25877460837364197,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1397.0,
|
|
"completions/max_terminated_length": 1397.0,
|
|
"completions/mean_length": 1060.125,
|
|
"completions/mean_terminated_length": 1060.125,
|
|
"completions/min_length": 685.0,
|
|
"completions/min_terminated_length": 685.0,
|
|
"epoch": 0.48,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6442195673577423,
|
|
"kl": 0.0004329681396484375,
|
|
"learning_rate": 6.740368101176495e-07,
|
|
"loss": 0.0696,
|
|
"num_tokens": 678611.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.1767766922712326,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5625,
|
|
"rewards/tag_count_reward/std": 0.1767766922712326,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.49,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.006123254776481617,
|
|
"kl": 0.00051116943359375,
|
|
"learning_rate": 6.588648530198504e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 696635.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1466.0,
|
|
"completions/mean_length": 1975.25,
|
|
"completions/mean_terminated_length": 1466.0,
|
|
"completions/min_length": 1466.0,
|
|
"completions/min_terminated_length": 1466.0,
|
|
"epoch": 0.5,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6336529820312526,
|
|
"kl": 0.0006256103515625,
|
|
"learning_rate": 6.435602608679916e-07,
|
|
"loss": 0.0585,
|
|
"num_tokens": 713709.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 856.0,
|
|
"completions/mean_length": 1899.0,
|
|
"completions/mean_terminated_length": 856.0,
|
|
"completions/min_length": 856.0,
|
|
"completions/min_terminated_length": 856.0,
|
|
"epoch": 0.51,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.49637328343233095,
|
|
"kl": 0.0006580352783203125,
|
|
"learning_rate": 6.281416799501187e-07,
|
|
"loss": 0.1451,
|
|
"num_tokens": 729901.0,
|
|
"reward": 0.40625,
|
|
"reward_std": 0.4419417381286621,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 2013.0,
|
|
"completions/mean_length": 2043.625,
|
|
"completions/mean_terminated_length": 2013.0,
|
|
"completions/min_length": 2013.0,
|
|
"completions/min_terminated_length": 2013.0,
|
|
"epoch": 0.52,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7195590756820243,
|
|
"kl": 0.00067138671875,
|
|
"learning_rate": 6.126278954320294e-07,
|
|
"loss": 0.0031,
|
|
"num_tokens": 747090.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 901.0,
|
|
"completions/max_terminated_length": 901.0,
|
|
"completions/mean_length": 565.5,
|
|
"completions/mean_terminated_length": 565.5,
|
|
"completions/min_length": 320.0,
|
|
"completions/min_terminated_length": 320.0,
|
|
"epoch": 0.53,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.031718782447743454,
|
|
"kl": 0.0008029937744140625,
|
|
"learning_rate": 5.97037808470444e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 752910.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 851.0,
|
|
"completions/mean_length": 1898.375,
|
|
"completions/mean_terminated_length": 851.0,
|
|
"completions/min_length": 851.0,
|
|
"completions/min_terminated_length": 851.0,
|
|
"epoch": 0.54,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6988079348841579,
|
|
"kl": 0.0009441375732421875,
|
|
"learning_rate": 5.813904131848564e-07,
|
|
"loss": 0.1114,
|
|
"num_tokens": 769249.0,
|
|
"reward": 0.3125,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.3125,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1934.0,
|
|
"completions/mean_length": 1715.875,
|
|
"completions/mean_terminated_length": 1383.75,
|
|
"completions/min_length": 749.0,
|
|
"completions/min_terminated_length": 749.0,
|
|
"epoch": 0.55,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7609580950694678,
|
|
"kl": 0.0007781982421875,
|
|
"learning_rate": 5.657047735161255e-07,
|
|
"loss": 0.1302,
|
|
"num_tokens": 783920.0,
|
|
"reward": 0.90625,
|
|
"reward_std": 0.6399986147880554,
|
|
"rewards/accuracy_reward/mean": 0.5,
|
|
"rewards/accuracy_reward/std": 0.5345224738121033,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.40625,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 953.0,
|
|
"completions/mean_length": 829.625,
|
|
"completions/mean_terminated_length": 655.5714721679688,
|
|
"completions/min_length": 397.0,
|
|
"completions/min_terminated_length": 397.0,
|
|
"epoch": 0.56,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.1927365377038142,
|
|
"kl": 0.001590728759765625,
|
|
"learning_rate": 5.5e-07,
|
|
"loss": 0.0939,
|
|
"num_tokens": 791709.0,
|
|
"reward": 0.53125,
|
|
"reward_std": 0.2086307406425476,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.53125,
|
|
"rewards/tag_count_reward/std": 0.2086307406425476,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1331.0,
|
|
"completions/mean_length": 994.5,
|
|
"completions/mean_terminated_length": 844.0000610351562,
|
|
"completions/min_length": 385.0,
|
|
"completions/min_terminated_length": 385.0,
|
|
"epoch": 0.57,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.182121142240305,
|
|
"kl": 0.0005893707275390625,
|
|
"learning_rate": 5.342952264838747e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 801057.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.2314550280570984,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.625,
|
|
"rewards/tag_count_reward/std": 0.2314550280570984,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1057.0,
|
|
"completions/max_terminated_length": 1057.0,
|
|
"completions/mean_length": 834.25,
|
|
"completions/mean_terminated_length": 834.25,
|
|
"completions/min_length": 659.0,
|
|
"completions/min_terminated_length": 659.0,
|
|
"epoch": 0.58,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8842736658535645,
|
|
"kl": 0.0005855560302734375,
|
|
"learning_rate": 5.186095868151436e-07,
|
|
"loss": 0.0177,
|
|
"num_tokens": 809131.0,
|
|
"reward": 0.875,
|
|
"reward_std": 0.5175491571426392,
|
|
"rewards/accuracy_reward/mean": 0.375,
|
|
"rewards/accuracy_reward/std": 0.5175492167472839,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1438.0,
|
|
"completions/max_terminated_length": 1438.0,
|
|
"completions/mean_length": 580.25,
|
|
"completions/mean_terminated_length": 580.25,
|
|
"completions/min_length": 190.0,
|
|
"completions/min_terminated_length": 190.0,
|
|
"epoch": 0.59,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.947212959421752,
|
|
"kl": 0.000576019287109375,
|
|
"learning_rate": 5.02962191529556e-07,
|
|
"loss": 0.1389,
|
|
"num_tokens": 814829.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.45806270837783813,
|
|
"rewards/accuracy_reward/mean": 0.25,
|
|
"rewards/accuracy_reward/std": 0.4629100561141968,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.6875,
|
|
"rewards/tag_count_reward/std": 0.1767766922712326,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1808.0,
|
|
"completions/mean_length": 1823.875,
|
|
"completions/mean_terminated_length": 1450.3333740234375,
|
|
"completions/min_length": 1205.0,
|
|
"completions/min_terminated_length": 1205.0,
|
|
"epoch": 0.6,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7296721993472337,
|
|
"kl": 0.0008335113525390625,
|
|
"learning_rate": 4.873721045679706e-07,
|
|
"loss": 0.1096,
|
|
"num_tokens": 831916.0,
|
|
"reward": 0.375,
|
|
"reward_std": 0.13363061845302582,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.375,
|
|
"rewards/tag_count_reward/std": 0.13363061845302582,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1916.0,
|
|
"completions/mean_length": 1570.0,
|
|
"completions/mean_terminated_length": 1410.666748046875,
|
|
"completions/min_length": 784.0,
|
|
"completions/min_terminated_length": 784.0,
|
|
"epoch": 0.61,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6173214568936407,
|
|
"kl": 0.000675201416015625,
|
|
"learning_rate": 4.7185832004988133e-07,
|
|
"loss": 0.0118,
|
|
"num_tokens": 845372.0,
|
|
"reward": 0.46875,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.46875,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 2010.0,
|
|
"completions/mean_length": 1746.5,
|
|
"completions/mean_terminated_length": 1445.0,
|
|
"completions/min_length": 1093.0,
|
|
"completions/min_terminated_length": 1093.0,
|
|
"epoch": 0.62,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6996417102286492,
|
|
"kl": 0.0004863739013671875,
|
|
"learning_rate": 4.5643973913200837e-07,
|
|
"loss": 0.0543,
|
|
"num_tokens": 860464.0,
|
|
"reward": 0.375,
|
|
"reward_std": 0.13363061845302582,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.375,
|
|
"rewards/tag_count_reward/std": 0.13363061845302582,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1941.0,
|
|
"completions/mean_length": 1849.75,
|
|
"completions/mean_terminated_length": 1651.5,
|
|
"completions/min_length": 1406.0,
|
|
"completions/min_terminated_length": 1406.0,
|
|
"epoch": 0.63,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.5518151553173074,
|
|
"kl": 0.00035190582275390625,
|
|
"learning_rate": 4.4113514698014953e-07,
|
|
"loss": 0.0645,
|
|
"num_tokens": 876246.0,
|
|
"reward": 0.4375,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1935.0,
|
|
"completions/mean_length": 1869.0,
|
|
"completions/mean_terminated_length": 1332.0,
|
|
"completions/min_length": 729.0,
|
|
"completions/min_terminated_length": 729.0,
|
|
"epoch": 0.64,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7947269366057148,
|
|
"kl": 0.00045108795166015625,
|
|
"learning_rate": 4.2596318988235037e-07,
|
|
"loss": 0.1176,
|
|
"num_tokens": 892662.0,
|
|
"reward": 0.375,
|
|
"reward_std": 0.13363061845302582,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.375,
|
|
"rewards/tag_count_reward/std": 0.13363061845302582,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1946.0,
|
|
"completions/mean_length": 1797.0,
|
|
"completions/mean_terminated_length": 1378.666748046875,
|
|
"completions/min_length": 662.0,
|
|
"completions/min_terminated_length": 662.0,
|
|
"epoch": 0.65,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7873341627366034,
|
|
"kl": 0.001064300537109375,
|
|
"learning_rate": 4.1094235253127374e-07,
|
|
"loss": 0.1647,
|
|
"num_tokens": 908198.0,
|
|
"reward": 0.34375,
|
|
"reward_std": 0.1293872892856598,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.34375,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1401.0,
|
|
"completions/mean_length": 1967.125,
|
|
"completions/mean_terminated_length": 1401.0,
|
|
"completions/min_length": 1401.0,
|
|
"completions/min_terminated_length": 1401.0,
|
|
"epoch": 0.66,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6108272802669615,
|
|
"kl": 0.0005245208740234375,
|
|
"learning_rate": 3.9609093550344907e-07,
|
|
"loss": 0.0663,
|
|
"num_tokens": 924743.0,
|
|
"reward": 0.40625,
|
|
"reward_std": 0.4419417381286621,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 2007.0,
|
|
"completions/max_terminated_length": 2007.0,
|
|
"completions/mean_length": 1388.0,
|
|
"completions/mean_terminated_length": 1388.0,
|
|
"completions/min_length": 1120.0,
|
|
"completions/min_terminated_length": 1120.0,
|
|
"epoch": 0.67,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6923989772553029,
|
|
"kl": 0.0004787445068359375,
|
|
"learning_rate": 3.8142703296283953e-07,
|
|
"loss": 0.0752,
|
|
"num_tokens": 937455.0,
|
|
"reward": 0.625,
|
|
"reward_std": 0.2314550280570984,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.625,
|
|
"rewards/tag_count_reward/std": 0.2314550280570984,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.68,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.5804438368543029,
|
|
"kl": 0.0006103515625,
|
|
"learning_rate": 3.6696851061588994e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 957231.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.5786375403404236,
|
|
"rewards/accuracy_reward/mean": 0.25,
|
|
"rewards/accuracy_reward/std": 0.4629100561141968,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.3125,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.69,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.602172715608051,
|
|
"kl": 0.0004863739013671875,
|
|
"learning_rate": 3.5273298394491515e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 974775.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1173.0,
|
|
"completions/mean_length": 1111.75,
|
|
"completions/mean_terminated_length": 978.0000610351562,
|
|
"completions/min_length": 717.0,
|
|
"completions/min_terminated_length": 717.0,
|
|
"epoch": 0.7,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.5800710876181367,
|
|
"kl": 0.0004787445068359375,
|
|
"learning_rate": 3.387377967463493e-07,
|
|
"loss": 0.0362,
|
|
"num_tokens": 985317.0,
|
|
"reward": 0.59375,
|
|
"reward_std": 0.376485139131546,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.46875,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1977.0,
|
|
"completions/mean_length": 1826.75,
|
|
"completions/mean_terminated_length": 1605.5,
|
|
"completions/min_length": 892.0,
|
|
"completions/min_terminated_length": 892.0,
|
|
"epoch": 0.71,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8588425272525544,
|
|
"kl": 0.001007080078125,
|
|
"learning_rate": 3.250000000000001e-07,
|
|
"loss": 0.025,
|
|
"num_tokens": 1001051.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.4225771427154541,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.375,
|
|
"rewards/tag_count_reward/std": 0.13363061845302582,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1592.0,
|
|
"completions/mean_length": 1284.0,
|
|
"completions/mean_terminated_length": 1174.857177734375,
|
|
"completions/min_length": 953.0,
|
|
"completions/min_terminated_length": 953.0,
|
|
"epoch": 0.72,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.013235371533423931,
|
|
"kl": 0.0008678436279296875,
|
|
"learning_rate": 3.115363310950578e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1012123.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 2024.0,
|
|
"completions/mean_length": 1916.25,
|
|
"completions/mean_terminated_length": 1696.666748046875,
|
|
"completions/min_length": 1519.0,
|
|
"completions/min_terminated_length": 1519.0,
|
|
"epoch": 0.73,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.5474968619673198,
|
|
"kl": 0.000751495361328125,
|
|
"learning_rate": 2.9836319343816397e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1028421.0,
|
|
"reward": 0.65625,
|
|
"reward_std": 0.5334774851799011,
|
|
"rewards/accuracy_reward/mean": 0.25,
|
|
"rewards/accuracy_reward/std": 0.4629100561141968,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.40625,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1883.0,
|
|
"completions/max_terminated_length": 1883.0,
|
|
"completions/mean_length": 1288.75,
|
|
"completions/mean_terminated_length": 1288.75,
|
|
"completions/min_length": 764.0,
|
|
"completions/min_terminated_length": 764.0,
|
|
"epoch": 0.74,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.003456095198800055,
|
|
"kl": 0.0003814697265625,
|
|
"learning_rate": 2.854966364683872e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1039587.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1668.0,
|
|
"completions/mean_length": 1781.0,
|
|
"completions/mean_terminated_length": 1514.0,
|
|
"completions/min_length": 1256.0,
|
|
"completions/min_terminated_length": 1256.0,
|
|
"epoch": 0.75,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.548319750098783,
|
|
"kl": 0.00042057037353515625,
|
|
"learning_rate": 2.729523361034538e-07,
|
|
"loss": 0.0938,
|
|
"num_tokens": 1054939.0,
|
|
"reward": 0.9375,
|
|
"reward_std": 0.6087164282798767,
|
|
"rewards/accuracy_reward/mean": 0.5,
|
|
"rewards/accuracy_reward/std": 0.5345224738121033,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.76,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0028496725166757297,
|
|
"kl": 0.00051116943359375,
|
|
"learning_rate": 2.6074557564105724e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1072235.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1943.0,
|
|
"completions/mean_length": 1953.375,
|
|
"completions/mean_terminated_length": 1669.5,
|
|
"completions/min_length": 1396.0,
|
|
"completions/min_terminated_length": 1396.0,
|
|
"epoch": 0.77,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.706431481153136,
|
|
"kl": 0.000545501708984375,
|
|
"learning_rate": 2.488912271385139e-07,
|
|
"loss": 0.0582,
|
|
"num_tokens": 1089334.0,
|
|
"reward": 0.3125,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.3125,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1759.0,
|
|
"completions/mean_length": 1863.25,
|
|
"completions/mean_terminated_length": 1555.3333740234375,
|
|
"completions/min_length": 1322.0,
|
|
"completions/min_terminated_length": 1322.0,
|
|
"epoch": 0.78,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7026848252787423,
|
|
"kl": 0.0008983612060546875,
|
|
"learning_rate": 2.374037332934512e-07,
|
|
"loss": 0.1001,
|
|
"num_tokens": 1105592.0,
|
|
"reward": 0.40625,
|
|
"reward_std": 0.2651650309562683,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.40625,
|
|
"rewards/tag_count_reward/std": 0.2651650309562683,
|
|
"step": 78
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1333.0,
|
|
"completions/max_terminated_length": 1333.0,
|
|
"completions/mean_length": 1021.625,
|
|
"completions/mean_terminated_length": 1021.625,
|
|
"completions/min_length": 679.0,
|
|
"completions/min_terminated_length": 679.0,
|
|
"epoch": 0.79,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.004479904510744182,
|
|
"kl": 0.0004482269287109375,
|
|
"learning_rate": 2.2629708984760706e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1114469.0,
|
|
"reward": 0.5,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 79
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.8,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.012713748507087811,
|
|
"kl": 0.000774383544921875,
|
|
"learning_rate": 2.1558482853517253e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1132061.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.81,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.009237916379458392,
|
|
"kl": 0.00116729736328125,
|
|
"learning_rate": 2.0528000059645995e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1149405.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 81
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.82,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.731282287844213,
|
|
"kl": 0.00102996826171875,
|
|
"learning_rate": 1.9539516087697517e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1167485.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 82
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1497.0,
|
|
"completions/mean_length": 1806.375,
|
|
"completions/mean_terminated_length": 1403.666748046875,
|
|
"completions/min_length": 1279.0,
|
|
"completions/min_terminated_length": 1279.0,
|
|
"epoch": 0.83,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6696391690257025,
|
|
"kl": 0.0007190704345703125,
|
|
"learning_rate": 1.8594235253127372e-07,
|
|
"loss": 0.1358,
|
|
"num_tokens": 1182888.0,
|
|
"reward": 0.34375,
|
|
"reward_std": 0.1293872892856598,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.34375,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 83
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 2029.0,
|
|
"completions/mean_length": 1856.375,
|
|
"completions/mean_terminated_length": 1537.0,
|
|
"completions/min_length": 1284.0,
|
|
"completions/min_terminated_length": 1284.0,
|
|
"epoch": 0.84,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8520445078964867,
|
|
"kl": 0.00098419189453125,
|
|
"learning_rate": 1.7693309235023127e-07,
|
|
"loss": 0.1106,
|
|
"num_tokens": 1198539.0,
|
|
"reward": 0.34375,
|
|
"reward_std": 0.1293872892856598,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.34375,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 84
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1619.0,
|
|
"completions/mean_length": 1552.75,
|
|
"completions/mean_terminated_length": 1387.666748046875,
|
|
"completions/min_length": 1177.0,
|
|
"completions/min_terminated_length": 1177.0,
|
|
"epoch": 0.85,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6415780475709305,
|
|
"kl": 0.00045013427734375,
|
|
"learning_rate": 1.6837835672960831e-07,
|
|
"loss": 0.1278,
|
|
"num_tokens": 1211897.0,
|
|
"reward": 0.4375,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1785.0,
|
|
"completions/mean_length": 1301.75,
|
|
"completions/mean_terminated_length": 854.0,
|
|
"completions/min_length": 401.0,
|
|
"completions/min_terminated_length": 401.0,
|
|
"epoch": 0.86,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 1.2846817650924338,
|
|
"kl": 0.0011138916015625,
|
|
"learning_rate": 1.6028856829700258e-07,
|
|
"loss": 0.3474,
|
|
"num_tokens": 1223335.0,
|
|
"reward": 0.4375,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.4375,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 86
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1927.0,
|
|
"completions/mean_length": 1210.625,
|
|
"completions/mean_terminated_length": 1091.0,
|
|
"completions/min_length": 543.0,
|
|
"completions/min_terminated_length": 543.0,
|
|
"epoch": 0.87,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7215403346916941,
|
|
"kl": 0.0008296966552734375,
|
|
"learning_rate": 1.5267358321348285e-07,
|
|
"loss": 0.0492,
|
|
"num_tokens": 1233980.0,
|
|
"reward": 0.53125,
|
|
"reward_std": 0.2086307406425476,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.53125,
|
|
"rewards/tag_count_reward/std": 0.2086307406425476,
|
|
"step": 87
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1000.0,
|
|
"completions/mean_length": 1143.375,
|
|
"completions/mean_terminated_length": 841.8333740234375,
|
|
"completions/min_length": 740.0,
|
|
"completions/min_terminated_length": 740.0,
|
|
"epoch": 0.88,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.888295640678355,
|
|
"kl": 0.0009326934814453125,
|
|
"learning_rate": 1.4554267916537495e-07,
|
|
"loss": 0.203,
|
|
"num_tokens": 1244423.0,
|
|
"reward": 0.5625,
|
|
"reward_std": 0.2912411689758301,
|
|
"rewards/accuracy_reward/mean": NaN,
|
|
"rewards/accuracy_reward/std": NaN,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.5625,
|
|
"rewards/tag_count_reward/std": 0.29124119877815247,
|
|
"step": 88
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -1.0,
|
|
"completions/max_length": 1903.0,
|
|
"completions/max_terminated_length": 1903.0,
|
|
"completions/mean_length": 1234.0,
|
|
"completions/mean_terminated_length": 1234.0,
|
|
"completions/min_length": 955.0,
|
|
"completions/min_terminated_length": 955.0,
|
|
"epoch": 0.89,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.4757099764606759,
|
|
"kl": 0.000911712646484375,
|
|
"learning_rate": 1.3890454406082956e-07,
|
|
"loss": 0.0644,
|
|
"num_tokens": 1255343.0,
|
|
"reward": 0.53125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.53125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 89
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1968.0,
|
|
"completions/mean_length": 2009.625,
|
|
"completions/mean_terminated_length": 1894.5,
|
|
"completions/min_length": 1821.0,
|
|
"completions/min_terminated_length": 1821.0,
|
|
"epoch": 0.9,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7840957699122901,
|
|
"kl": 0.00072479248046875,
|
|
"learning_rate": 1.3276726544494571e-07,
|
|
"loss": 0.0212,
|
|
"num_tokens": 1272652.0,
|
|
"reward": 0.3125,
|
|
"reward_std": 0.1157275140285492,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.3125,
|
|
"rewards/tag_count_reward/std": 0.1157275140285492,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.25,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1859.0,
|
|
"completions/mean_length": 1858.25,
|
|
"completions/mean_terminated_length": 1542.0,
|
|
"completions/min_length": 1311.0,
|
|
"completions/min_terminated_length": 1311.0,
|
|
"epoch": 0.91,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.8165269333115026,
|
|
"kl": 0.00063323974609375,
|
|
"learning_rate": 1.2713832064634125e-07,
|
|
"loss": 0.0456,
|
|
"num_tokens": 1289190.0,
|
|
"reward": 0.46875,
|
|
"reward_std": 0.4317220449447632,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.34375,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 91
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.92,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6822113621589488,
|
|
"kl": 0.000732421875,
|
|
"learning_rate": 1.220245676671809e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1307070.0,
|
|
"reward": 0.28125,
|
|
"reward_std": 0.0883883461356163,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 92
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.75,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1638.0,
|
|
"completions/mean_length": 1996.75,
|
|
"completions/mean_terminated_length": 1638.0,
|
|
"completions/min_length": 1638.0,
|
|
"completions/min_terminated_length": 1638.0,
|
|
"epoch": 0.93,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.5735308409410357,
|
|
"kl": 0.000598907470703125,
|
|
"learning_rate": 1.1743223682775649e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1324564.0,
|
|
"reward": 0.34375,
|
|
"reward_std": 0.1293872892856598,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.34375,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 93
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.94,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0035277658572437772,
|
|
"kl": 0.00074005126953125,
|
|
"learning_rate": 1.1336692317580158e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1342244.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 94
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.95,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.0033309614626830077,
|
|
"kl": 0.0007114410400390625,
|
|
"learning_rate": 1.0983357966978745e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1360532.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.96,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.009616790202442917,
|
|
"kl": 0.0008449554443359375,
|
|
"learning_rate": 1.068365111445064e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1377964.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 96
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": -0.5,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1899.0,
|
|
"completions/mean_length": 1599.625,
|
|
"completions/mean_terminated_length": 1450.166748046875,
|
|
"completions/min_length": 1096.0,
|
|
"completions/min_terminated_length": 1096.0,
|
|
"epoch": 0.97,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6530906969360044,
|
|
"kl": 0.0005645751953125,
|
|
"learning_rate": 1.0437936906629334e-07,
|
|
"loss": 0.0652,
|
|
"num_tokens": 1391689.0,
|
|
"reward": 0.84375,
|
|
"reward_std": 0.5499594211578369,
|
|
"rewards/accuracy_reward/mean": 0.375,
|
|
"rewards/accuracy_reward/std": 0.5175492167472839,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.46875,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 97
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.98,
|
|
"frac_reward_zero_std": 1.0,
|
|
"grad_norm": 0.01164341791205279,
|
|
"kl": 0.0011444091796875,
|
|
"learning_rate": 1.0246514708427701e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1410297.0,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.25,
|
|
"rewards/tag_count_reward/std": 0.0,
|
|
"step": 98
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 1.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 0.0,
|
|
"completions/mean_length": 2048.0,
|
|
"completions/mean_terminated_length": 0.0,
|
|
"completions/min_length": 2048.0,
|
|
"completions/min_terminated_length": 0.0,
|
|
"epoch": 0.99,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.6801784484817373,
|
|
"kl": 0.0009212493896484375,
|
|
"learning_rate": 1.0109617738307911e-07,
|
|
"loss": 0.0,
|
|
"num_tokens": 1427513.0,
|
|
"reward": 0.40625,
|
|
"reward_std": 0.4419417381286621,
|
|
"rewards/accuracy_reward/mean": 0.125,
|
|
"rewards/accuracy_reward/std": 0.3535533845424652,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.28125,
|
|
"rewards/tag_count_reward/std": 0.0883883461356163,
|
|
"step": 99
|
|
},
|
|
{
|
|
"clip_ratio/high_max": 0.0,
|
|
"clip_ratio/high_mean": 0.0,
|
|
"clip_ratio/low_mean": 0.0,
|
|
"clip_ratio/low_min": 0.0,
|
|
"clip_ratio/region_mean": 0.0,
|
|
"completions/clipped_ratio": 0.0,
|
|
"completions/max_length": 2048.0,
|
|
"completions/max_terminated_length": 1996.0,
|
|
"completions/mean_length": 1748.625,
|
|
"completions/mean_terminated_length": 1449.25,
|
|
"completions/min_length": 915.0,
|
|
"completions/min_terminated_length": 915.0,
|
|
"epoch": 1.0,
|
|
"frac_reward_zero_std": 0.0,
|
|
"grad_norm": 0.7939072364235943,
|
|
"kl": 0.0008449554443359375,
|
|
"learning_rate": 1.002741278414069e-07,
|
|
"loss": 0.1071,
|
|
"num_tokens": 1442638.0,
|
|
"reward": 0.40625,
|
|
"reward_std": 0.1293872892856598,
|
|
"rewards/accuracy_reward/mean": 0.0,
|
|
"rewards/accuracy_reward/std": 0.0,
|
|
"rewards/format_reward/mean": 0.0,
|
|
"rewards/format_reward/std": 0.0,
|
|
"rewards/tag_count_reward/mean": 0.40625,
|
|
"rewards/tag_count_reward/std": 0.12938730418682098,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"step": 100,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.0464448650211034,
|
|
"train_runtime": 1230.2242,
|
|
"train_samples_per_second": 0.081,
|
|
"train_steps_per_second": 0.081
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 100,
|
|
"num_input_tokens_seen": 1442638,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 2,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|