2313 lines
86 KiB
JSON
2313 lines
86 KiB
JSON
|
|
{
|
||
|
|
"phase": "sft+grpo",
|
||
|
|
"sft_history": [
|
||
|
|
{
|
||
|
|
"loss": 1.8329996109008788,
|
||
|
|
"grad_norm": 2.6284756660461426,
|
||
|
|
"learning_rate": 2.6666666666666667e-05,
|
||
|
|
"epoch": 0.16666666666666666,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.641743278503418,
|
||
|
|
"grad_norm": 0.9074174761772156,
|
||
|
|
"learning_rate": 6e-05,
|
||
|
|
"epoch": 0.3333333333333333,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3325251579284667,
|
||
|
|
"grad_norm": 0.772527277469635,
|
||
|
|
"learning_rate": 9.333333333333334e-05,
|
||
|
|
"epoch": 0.5,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.908332347869873,
|
||
|
|
"grad_norm": 0.8558230400085449,
|
||
|
|
"learning_rate": 0.00012666666666666666,
|
||
|
|
"epoch": 0.6666666666666666,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.4191232204437256,
|
||
|
|
"grad_norm": 0.6383947134017944,
|
||
|
|
"learning_rate": 0.00016,
|
||
|
|
"epoch": 0.8333333333333334,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.20252063274383544,
|
||
|
|
"grad_norm": 0.24536560475826263,
|
||
|
|
"learning_rate": 0.00019333333333333333,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.1843562602996826,
|
||
|
|
"grad_norm": 0.1841956526041031,
|
||
|
|
"learning_rate": 0.0001913545457642601,
|
||
|
|
"epoch": 1.1666666666666667,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.1743373155593872,
|
||
|
|
"grad_norm": 0.12225674837827682,
|
||
|
|
"learning_rate": 0.00015877852522924732,
|
||
|
|
"epoch": 1.3333333333333333,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.1707882285118103,
|
||
|
|
"grad_norm": 0.11675203591585159,
|
||
|
|
"learning_rate": 0.00011045284632676536,
|
||
|
|
"epoch": 1.5,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.17305984497070312,
|
||
|
|
"grad_norm": 0.168966606259346,
|
||
|
|
"learning_rate": 5.9326335692419995e-05,
|
||
|
|
"epoch": 1.6666666666666665,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.1723298192024231,
|
||
|
|
"grad_norm": 0.14092567563056946,
|
||
|
|
"learning_rate": 1.9098300562505266e-05,
|
||
|
|
"epoch": 1.8333333333333335,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.16860610246658325,
|
||
|
|
"grad_norm": 0.13329552114009857,
|
||
|
|
"learning_rate": 5.478104631726711e-07,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"train_runtime": 1079.0765,
|
||
|
|
"train_samples_per_second": 1.112,
|
||
|
|
"train_steps_per_second": 0.056,
|
||
|
|
"total_flos": 5520149869086720.0,
|
||
|
|
"train_loss": 0.6150601516167323,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"step": 60
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"grpo_history": [
|
||
|
|
{
|
||
|
|
"loss": 0.00047351792454719543,
|
||
|
|
"grad_norm": 0.72265625,
|
||
|
|
"learning_rate": 8.333333333333333e-07,
|
||
|
|
"num_tokens": 13592.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.4375,
|
||
|
|
"rewards/reward_format/std": 0.3535533770918846,
|
||
|
|
"rewards/reward_welfare/mean": 0.0625,
|
||
|
|
"rewards/reward_welfare/std": 0.1767766922712326,
|
||
|
|
"rewards/reward_fairness/mean": 0.03318497911095619,
|
||
|
|
"rewards/reward_fairness/std": 0.09386129677295685,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.02344914712011814,
|
||
|
|
"rewards/reward_composite/std": 0.06632420420646667,
|
||
|
|
"reward": 0.6816341280937195,
|
||
|
|
"reward_std": 0.48826825618743896,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.47351907938718796,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.016666666666666666,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00041250139474868774,
|
||
|
|
"grad_norm": 0.68359375,
|
||
|
|
"learning_rate": 2.5e-06,
|
||
|
|
"num_tokens": 26592.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.33522727340459824,
|
||
|
|
"rewards/reward_format/std": 0.3089452385902405,
|
||
|
|
"rewards/reward_welfare/mean": 0.125,
|
||
|
|
"rewards/reward_welfare/std": 0.2314550280570984,
|
||
|
|
"rewards/reward_fairness/mean": 0.037382133305072784,
|
||
|
|
"rewards/reward_fairness/std": 0.07023922353982925,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.03359023481607437,
|
||
|
|
"rewards/reward_composite/std": 0.06231452897191048,
|
||
|
|
"reward": 0.8607450723648071,
|
||
|
|
"reward_std": 0.4173068106174469,
|
||
|
|
"frac_reward_zero_std": 0.75,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.4125128909945488,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.03333333333333333,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003954425919800997,
|
||
|
|
"grad_norm": 0.00665283203125,
|
||
|
|
"learning_rate": 4.166666666666667e-06,
|
||
|
|
"num_tokens": 39888.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.53125,
|
||
|
|
"rewards/reward_format/std": 0.0883883461356163,
|
||
|
|
"rewards/reward_welfare/mean": 0.0,
|
||
|
|
"rewards/reward_welfare/std": 0.0,
|
||
|
|
"rewards/reward_fairness/mean": 0.0,
|
||
|
|
"rewards/reward_fairness/std": 0.0,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.0,
|
||
|
|
"rewards/reward_composite/std": 0.0,
|
||
|
|
"reward": 0.46875,
|
||
|
|
"reward_std": 0.0625,
|
||
|
|
"frac_reward_zero_std": 0.75,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3954422175884247,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.05,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00040989843546412885,
|
||
|
|
"grad_norm": 0.013671875,
|
||
|
|
"learning_rate": 4.995770395678171e-06,
|
||
|
|
"num_tokens": 53776.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.5,
|
||
|
|
"rewards/reward_format/std": 0.0,
|
||
|
|
"rewards/reward_welfare/mean": 0.0,
|
||
|
|
"rewards/reward_welfare/std": 0.0,
|
||
|
|
"rewards/reward_fairness/mean": 0.0,
|
||
|
|
"rewards/reward_fairness/std": 0.0,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.0,
|
||
|
|
"rewards/reward_composite/std": 0.0,
|
||
|
|
"reward": 0.5,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"frac_reward_zero_std": 1.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.40989840030670166,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.06666666666666667,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00042488425970077515,
|
||
|
|
"grad_norm": 0.4921875,
|
||
|
|
"learning_rate": 4.962019382530521e-06,
|
||
|
|
"num_tokens": 67664.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.46875,
|
||
|
|
"rewards/reward_format/std": 0.2651650384068489,
|
||
|
|
"rewards/reward_welfare/mean": 0.0625,
|
||
|
|
"rewards/reward_welfare/std": 0.1767766922712326,
|
||
|
|
"rewards/reward_fairness/mean": 0.02351469174027443,
|
||
|
|
"rewards/reward_fairness/std": 0.06650959700345993,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.023048987612128258,
|
||
|
|
"rewards/reward_composite/std": 0.06519238650798798,
|
||
|
|
"reward": 0.6403136849403381,
|
||
|
|
"reward_std": 0.40562736988067627,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.42487896233797073,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.08333333333333333,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003492364485282451,
|
||
|
|
"grad_norm": 0.703125,
|
||
|
|
"learning_rate": 4.894973780788722e-06,
|
||
|
|
"num_tokens": 81552.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.34659090638160706,
|
||
|
|
"rewards/reward_format/std": 0.6987431943416595,
|
||
|
|
"rewards/reward_welfare/mean": 0.1875,
|
||
|
|
"rewards/reward_welfare/std": 0.408231720328331,
|
||
|
|
"rewards/reward_fairness/mean": 0.060186946764588356,
|
||
|
|
"rewards/reward_fairness/std": 0.13657810539007187,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.05276940576732159,
|
||
|
|
"rewards/reward_composite/std": 0.11823124438524246,
|
||
|
|
"reward": 0.9538654386997223,
|
||
|
|
"reward_std": 1.2448847889900208,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.34924405813217163,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.1,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00039254588773474097,
|
||
|
|
"grad_norm": 0.7890625,
|
||
|
|
"learning_rate": 4.7955402672006855e-06,
|
||
|
|
"num_tokens": 95440.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.2755681872367859,
|
||
|
|
"rewards/reward_format/std": 0.5705045461654663,
|
||
|
|
"rewards/reward_welfare/mean": 0.1875,
|
||
|
|
"rewards/reward_welfare/std": 0.408231720328331,
|
||
|
|
"rewards/reward_fairness/mean": 0.05876787751913071,
|
||
|
|
"rewards/reward_fairness/std": 0.13999952003359795,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.051001692190766335,
|
||
|
|
"rewards/reward_composite/std": 0.12243235111236572,
|
||
|
|
"reward": 1.0217013657093048,
|
||
|
|
"reward_std": 1.1684027314186096,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3925560265779495,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.11666666666666667,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00040383817395195365,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 4.665063509461098e-06,
|
||
|
|
"num_tokens": 108736.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.16477272659540176,
|
||
|
|
"rewards/reward_format/std": 0.6252594292163849,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.10286042466759682,
|
||
|
|
"rewards/reward_fairness/std": 0.16851608455181122,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.09322065114974976,
|
||
|
|
"rewards/reward_composite/std": 0.1486019790172577,
|
||
|
|
"reward": 1.343808352947235,
|
||
|
|
"reward_std": 1.4776567816734314,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.40385157614946365,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.13333333333333333,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0004179440438747406,
|
||
|
|
"grad_norm": 0.65234375,
|
||
|
|
"learning_rate": 4.50530798188761e-06,
|
||
|
|
"num_tokens": 122624.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.34375,
|
||
|
|
"rewards/reward_format/std": 0.5564062297344208,
|
||
|
|
"rewards/reward_welfare/mean": 0.125,
|
||
|
|
"rewards/reward_welfare/std": 0.3535533845424652,
|
||
|
|
"rewards/reward_fairness/mean": 0.04079132154583931,
|
||
|
|
"rewards/reward_fairness/std": 0.11537527851760387,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.035708064679056406,
|
||
|
|
"rewards/reward_composite/std": 0.10099766962230206,
|
||
|
|
"reward": 0.8577493727207184,
|
||
|
|
"reward_std": 0.8404987752437592,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.41793932020664215,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.15,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00039753690361976624,
|
||
|
|
"grad_norm": 0.75,
|
||
|
|
"learning_rate": 4.318434103932622e-06,
|
||
|
|
"num_tokens": 136512.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.35795454680919647,
|
||
|
|
"rewards/reward_format/std": 0.6613655686378479,
|
||
|
|
"rewards/reward_welfare/mean": 0.1875,
|
||
|
|
"rewards/reward_welfare/std": 0.408231720328331,
|
||
|
|
"rewards/reward_fairness/mean": 0.05811220221221447,
|
||
|
|
"rewards/reward_fairness/std": 0.1433359570801258,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.04960842803120613,
|
||
|
|
"rewards/reward_composite/std": 0.11969681829214096,
|
||
|
|
"reward": 0.9372660517692566,
|
||
|
|
"reward_std": 0.9138101935386658,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.39755555987358093,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.16666666666666666,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003871597582474351,
|
||
|
|
"grad_norm": 0.005157470703125,
|
||
|
|
"learning_rate": 4.106969024216348e-06,
|
||
|
|
"num_tokens": 150104.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.21875,
|
||
|
|
"rewards/reward_format/std": 0.38816189765930176,
|
||
|
|
"rewards/reward_welfare/mean": 0.1875,
|
||
|
|
"rewards/reward_welfare/std": 0.25877460837364197,
|
||
|
|
"rewards/reward_fairness/mean": 0.09772966802120209,
|
||
|
|
"rewards/reward_fairness/std": 0.1411271095275879,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.07592316716909409,
|
||
|
|
"rewards/reward_composite/std": 0.10568810254335403,
|
||
|
|
"reward": 1.1424028873443604,
|
||
|
|
"reward_std": 0.9167249202728271,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3871647119522095,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.18333333333333332,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00046034157276153564,
|
||
|
|
"grad_norm": 0.828125,
|
||
|
|
"learning_rate": 3.8737724451770155e-06,
|
||
|
|
"num_tokens": 163992.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.053977273404598236,
|
||
|
|
"rewards/reward_format/std": 0.6971071362495422,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.11436978727579117,
|
||
|
|
"rewards/reward_fairness/std": 0.21275469660758972,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.08883418142795563,
|
||
|
|
"rewards/reward_composite/std": 0.15858761221170425,
|
||
|
|
"reward": 1.4617266654968262,
|
||
|
|
"reward_std": 1.6014615297317505,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.46036188304424286,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.2,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037954188883304596,
|
||
|
|
"grad_norm": 0.75390625,
|
||
|
|
"learning_rate": 3.621997950501156e-06,
|
||
|
|
"num_tokens": 177584.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.11079545319080353,
|
||
|
|
"rewards/reward_format/std": 0.7605703175067902,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.13740837946534157,
|
||
|
|
"rewards/reward_fairness/std": 0.23384775966405869,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.1134110763669014,
|
||
|
|
"rewards/reward_composite/std": 0.1934959888458252,
|
||
|
|
"reward": 1.4525240659713745,
|
||
|
|
"reward_std": 1.4633366465568542,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3795487657189369,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.21666666666666667,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00034568458795547485,
|
||
|
|
"grad_norm": 0.67578125,
|
||
|
|
"learning_rate": 3.3550503583141726e-06,
|
||
|
|
"num_tokens": 190880.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.014204561710357666,
|
||
|
|
"rewards/reward_format/std": 0.45640653371810913,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.2314550280570984,
|
||
|
|
"rewards/reward_fairness/mean": 0.17191734910011292,
|
||
|
|
"rewards/reward_fairness/std": 0.165505051612854,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.12545911967754364,
|
||
|
|
"rewards/reward_composite/std": 0.09392639249563217,
|
||
|
|
"reward": 1.6581718921661377,
|
||
|
|
"reward_std": 0.9023097902536392,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3456726223230362,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.23333333333333334,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0004424452781677246,
|
||
|
|
"grad_norm": 0.79296875,
|
||
|
|
"learning_rate": 3.0765396768561005e-06,
|
||
|
|
"num_tokens": 204472.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.05681818723678589,
|
||
|
|
"rewards/reward_format/std": 0.8016891181468964,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.49871626496315,
|
||
|
|
"rewards/reward_fairness/mean": 0.11990131065249443,
|
||
|
|
"rewards/reward_fairness/std": 0.19920051097869873,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.10109234228730202,
|
||
|
|
"rewards/reward_composite/std": 0.15720761567354202,
|
||
|
|
"reward": 1.5391755104064941,
|
||
|
|
"reward_std": 1.311523675918579,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.44244876503944397,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.25,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00044108927249908447,
|
||
|
|
"grad_norm": 0.79296875,
|
||
|
|
"learning_rate": 2.7902322853130758e-06,
|
||
|
|
"num_tokens": 218360.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.23011364042758942,
|
||
|
|
"rewards/reward_format/std": 0.7126934230327606,
|
||
|
|
"rewards/reward_welfare/mean": 0.25,
|
||
|
|
"rewards/reward_welfare/std": 0.4355513006448746,
|
||
|
|
"rewards/reward_fairness/mean": 0.09009831957519054,
|
||
|
|
"rewards/reward_fairness/std": 0.17519650608301163,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.07756261341273785,
|
||
|
|
"rewards/reward_composite/std": 0.1447325348854065,
|
||
|
|
"reward": 1.1875473260879517,
|
||
|
|
"reward_std": 1.3125466108322144,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.4411006420850754,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.26666666666666666,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00043725594878196716,
|
||
|
|
"grad_norm": 0.765625,
|
||
|
|
"learning_rate": 2.5e-06,
|
||
|
|
"num_tokens": 232248.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.3238636404275894,
|
||
|
|
"rewards/reward_format/std": 0.7694187164306641,
|
||
|
|
"rewards/reward_welfare/mean": 0.5625,
|
||
|
|
"rewards/reward_welfare/std": 0.5260358452796936,
|
||
|
|
"rewards/reward_fairness/mean": 0.19406583905220032,
|
||
|
|
"rewards/reward_fairness/std": 0.20889797061681747,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.1601117104291916,
|
||
|
|
"rewards/reward_composite/std": 0.16504594683647156,
|
||
|
|
"reward": 2.2405412197113037,
|
||
|
|
"reward_std": 1.72732412815094,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.43723437190055847,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.2833333333333333,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003281831741333008,
|
||
|
|
"grad_norm": 0.66015625,
|
||
|
|
"learning_rate": 2.2097677146869242e-06,
|
||
|
|
"num_tokens": 245840.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.0625,
|
||
|
|
"rewards/reward_format/std": 0.8398386240005493,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.5175492167472839,
|
||
|
|
"rewards/reward_fairness/mean": 0.208244688808918,
|
||
|
|
"rewards/reward_fairness/std": 0.3558191955089569,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.12120818346738815,
|
||
|
|
"rewards/reward_composite/std": 0.17605619877576828,
|
||
|
|
"reward": 1.6419528722763062,
|
||
|
|
"reward_std": 1.9059234857559204,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.32817772775888443,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.3,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003703221445903182,
|
||
|
|
"grad_norm": 0.65625,
|
||
|
|
"learning_rate": 1.9234603231439e-06,
|
||
|
|
"num_tokens": 258840.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.34375,
|
||
|
|
"rewards/reward_format/std": 0.4355708882212639,
|
||
|
|
"rewards/reward_welfare/mean": 0.125,
|
||
|
|
"rewards/reward_welfare/std": 0.2314550280570984,
|
||
|
|
"rewards/reward_fairness/mean": 0.032129574567079544,
|
||
|
|
"rewards/reward_fairness/std": 0.06755802780389786,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.0333767905831337,
|
||
|
|
"rewards/reward_composite/std": 0.07086637616157532,
|
||
|
|
"reward": 0.8467563986778259,
|
||
|
|
"reward_std": 0.8185127973556519,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3703107312321663,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.31666666666666665,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003897678107023239,
|
||
|
|
"grad_norm": 0.76171875,
|
||
|
|
"learning_rate": 1.6449496416858285e-06,
|
||
|
|
"num_tokens": 272136.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.07102272659540176,
|
||
|
|
"rewards/reward_format/std": 0.8448813557624817,
|
||
|
|
"rewards/reward_welfare/mean": 0.4375,
|
||
|
|
"rewards/reward_welfare/std": 0.5260358452796936,
|
||
|
|
"rewards/reward_fairness/mean": 0.13828522339463234,
|
||
|
|
"rewards/reward_fairness/std": 0.19835777580738068,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.11603889241814613,
|
||
|
|
"rewards/reward_composite/std": 0.15580761432647705,
|
||
|
|
"reward": 1.762846827507019,
|
||
|
|
"reward_std": 1.750555157661438,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.38978311419487,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.3333333333333333,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00036280229687690735,
|
||
|
|
"grad_norm": 0.52734375,
|
||
|
|
"learning_rate": 1.3780020494988447e-06,
|
||
|
|
"num_tokens": 286024.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.21875,
|
||
|
|
"rewards/reward_format/std": 0.7038120031356812,
|
||
|
|
"rewards/reward_welfare/mean": 0.25,
|
||
|
|
"rewards/reward_welfare/std": 0.4629100561141968,
|
||
|
|
"rewards/reward_fairness/mean": 0.08049380034208298,
|
||
|
|
"rewards/reward_fairness/std": 0.14985806494951248,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.06840312853455544,
|
||
|
|
"rewards/reward_composite/std": 0.12792598456144333,
|
||
|
|
"reward": 1.1801469326019287,
|
||
|
|
"reward_std": 1.2528201341629028,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.36278442293405533,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.35,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00041984766721725464,
|
||
|
|
"grad_norm": 0.734375,
|
||
|
|
"learning_rate": 1.1262275548229852e-06,
|
||
|
|
"num_tokens": 299320.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.125,
|
||
|
|
"rewards/reward_format/std": 0.7535476386547089,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.44403792917728424,
|
||
|
|
"rewards/reward_fairness/mean": 0.08010485023260117,
|
||
|
|
"rewards/reward_fairness/std": 0.13119615614414215,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.06627386063337326,
|
||
|
|
"rewards/reward_composite/std": 0.0994122326374054,
|
||
|
|
"reward": 1.333878755569458,
|
||
|
|
"reward_std": 1.2175767719745636,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.4198339805006981,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.36666666666666664,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037025846540927887,
|
||
|
|
"grad_norm": 0.74609375,
|
||
|
|
"learning_rate": 8.930309757836517e-07,
|
||
|
|
"num_tokens": 312912.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.15625,
|
||
|
|
"rewards/reward_format/std": 0.7469770908355713,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.44403792917728424,
|
||
|
|
"rewards/reward_fairness/mean": 0.0987030416727066,
|
||
|
|
"rewards/reward_fairness/std": 0.13824082165956497,
|
||
|
|
"rewards/reward_stability/mean": 0.9375,
|
||
|
|
"rewards/reward_stability/std": 0.1767766922712326,
|
||
|
|
"rewards/reward_composite/mean": 0.0686455499380827,
|
||
|
|
"rewards/reward_composite/std": 0.11963466554880142,
|
||
|
|
"reward": 1.2610985934734344,
|
||
|
|
"reward_std": 1.1380138397216797,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3702595606446266,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.38333333333333336,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037697795778512955,
|
||
|
|
"grad_norm": 0.68359375,
|
||
|
|
"learning_rate": 6.815658960673782e-07,
|
||
|
|
"num_tokens": 326208.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.07954545319080353,
|
||
|
|
"rewards/reward_format/std": 0.7752929627895355,
|
||
|
|
"rewards/reward_welfare/mean": 0.4375,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.1642819568514824,
|
||
|
|
"rewards/reward_fairness/std": 0.2405308187007904,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.12362180650234222,
|
||
|
|
"rewards/reward_composite/std": 0.16839426010847092,
|
||
|
|
"reward": 1.8049492835998535,
|
||
|
|
"reward_std": 1.6678152084350586,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3769652917981148,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.4,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00039646029472351074,
|
||
|
|
"grad_norm": 0.6953125,
|
||
|
|
"learning_rate": 4.946920181123904e-07,
|
||
|
|
"num_tokens": 339800.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.06818181276321411,
|
||
|
|
"rewards/reward_format/std": 0.7570639848709106,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.12952633947134018,
|
||
|
|
"rewards/reward_fairness/std": 0.2085839882493019,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.10791970416903496,
|
||
|
|
"rewards/reward_composite/std": 0.1719568744301796,
|
||
|
|
"reward": 1.4817642569541931,
|
||
|
|
"reward_std": 1.4108701944351196,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.39645931124687195,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.4166666666666667,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.000441722571849823,
|
||
|
|
"grad_norm": 0.7265625,
|
||
|
|
"learning_rate": 3.3493649053890325e-07,
|
||
|
|
"num_tokens": 353688.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.03125,
|
||
|
|
"rewards/reward_format/std": 0.8107390403747559,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.5175492167472839,
|
||
|
|
"rewards/reward_fairness/mean": 0.10219378396868706,
|
||
|
|
"rewards/reward_fairness/std": 0.1520508974790573,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.09825712814927101,
|
||
|
|
"rewards/reward_composite/std": 0.1404884159564972,
|
||
|
|
"reward": 1.6067009568214417,
|
||
|
|
"reward_std": 1.2484731674194336,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.44170165807008743,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.43333333333333335,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037848297506570816,
|
||
|
|
"grad_norm": 0.796875,
|
||
|
|
"learning_rate": 2.044597327993153e-07,
|
||
|
|
"num_tokens": 367280.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.23579545319080353,
|
||
|
|
"rewards/reward_format/std": 0.7612137198448181,
|
||
|
|
"rewards/reward_welfare/mean": 0.5625,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.2322249710559845,
|
||
|
|
"rewards/reward_fairness/std": 0.20398348569869995,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.19660773873329163,
|
||
|
|
"rewards/reward_composite/std": 0.17612425237894058,
|
||
|
|
"reward": 2.2271281480789185,
|
||
|
|
"reward_std": 1.4199119210243225,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3786723464727402,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.45,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00040780752897262573,
|
||
|
|
"grad_norm": 0.65234375,
|
||
|
|
"learning_rate": 1.0502621921127776e-07,
|
||
|
|
"num_tokens": 380872.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.0625,
|
||
|
|
"rewards/reward_format/std": 0.7646470665931702,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.09930047020316124,
|
||
|
|
"rewards/reward_fairness/std": 0.18247877806425095,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.08683200180530548,
|
||
|
|
"rewards/reward_composite/std": 0.15039421617984772,
|
||
|
|
"reward": 1.4361324906349182,
|
||
|
|
"reward_std": 1.6369856595993042,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.4077882617712021,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.4666666666666667,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00038760900497436523,
|
||
|
|
"grad_norm": 0.74609375,
|
||
|
|
"learning_rate": 3.798061746947995e-08,
|
||
|
|
"num_tokens": 394168.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.002840910106897354,
|
||
|
|
"rewards/reward_format/std": 0.8362354636192322,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.5175492167472839,
|
||
|
|
"rewards/reward_fairness/mean": 0.11489119380712509,
|
||
|
|
"rewards/reward_fairness/std": 0.17957812547683716,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.11398349702358246,
|
||
|
|
"rewards/reward_composite/std": 0.17128486931324005,
|
||
|
|
"reward": 1.6010336875915527,
|
||
|
|
"reward_std": 1.725760817527771,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3876567706465721,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.48333333333333334,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037222355604171753,
|
||
|
|
"grad_norm": 0.703125,
|
||
|
|
"learning_rate": 4.229604321829561e-09,
|
||
|
|
"num_tokens": 407760.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.46875,
|
||
|
|
"rewards/reward_format/std": 0.6302918791770935,
|
||
|
|
"rewards/reward_welfare/mean": 0.125,
|
||
|
|
"rewards/reward_welfare/std": 0.3535533845424652,
|
||
|
|
"rewards/reward_fairness/mean": 0.015625,
|
||
|
|
"rewards/reward_fairness/std": 0.04419417306780815,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.018206155858933926,
|
||
|
|
"rewards/reward_composite/std": 0.05149478651583195,
|
||
|
|
"reward": 0.6900811493396759,
|
||
|
|
"reward_std": 0.873493492603302,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3722131997346878,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.5,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"train_runtime": 1880.1016,
|
||
|
|
"train_samples_per_second": 0.255,
|
||
|
|
"train_steps_per_second": 0.032,
|
||
|
|
"total_flos": 0.0,
|
||
|
|
"train_loss": 0.0003991109939912955,
|
||
|
|
"epoch": 0.5,
|
||
|
|
"step": 60
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"history": [
|
||
|
|
{
|
||
|
|
"loss": 1.8329996109008788,
|
||
|
|
"grad_norm": 2.6284756660461426,
|
||
|
|
"learning_rate": 2.6666666666666667e-05,
|
||
|
|
"epoch": 0.16666666666666666,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.641743278503418,
|
||
|
|
"grad_norm": 0.9074174761772156,
|
||
|
|
"learning_rate": 6e-05,
|
||
|
|
"epoch": 0.3333333333333333,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 1.3325251579284667,
|
||
|
|
"grad_norm": 0.772527277469635,
|
||
|
|
"learning_rate": 9.333333333333334e-05,
|
||
|
|
"epoch": 0.5,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.908332347869873,
|
||
|
|
"grad_norm": 0.8558230400085449,
|
||
|
|
"learning_rate": 0.00012666666666666666,
|
||
|
|
"epoch": 0.6666666666666666,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.4191232204437256,
|
||
|
|
"grad_norm": 0.6383947134017944,
|
||
|
|
"learning_rate": 0.00016,
|
||
|
|
"epoch": 0.8333333333333334,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.20252063274383544,
|
||
|
|
"grad_norm": 0.24536560475826263,
|
||
|
|
"learning_rate": 0.00019333333333333333,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.1843562602996826,
|
||
|
|
"grad_norm": 0.1841956526041031,
|
||
|
|
"learning_rate": 0.0001913545457642601,
|
||
|
|
"epoch": 1.1666666666666667,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.1743373155593872,
|
||
|
|
"grad_norm": 0.12225674837827682,
|
||
|
|
"learning_rate": 0.00015877852522924732,
|
||
|
|
"epoch": 1.3333333333333333,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.1707882285118103,
|
||
|
|
"grad_norm": 0.11675203591585159,
|
||
|
|
"learning_rate": 0.00011045284632676536,
|
||
|
|
"epoch": 1.5,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.17305984497070312,
|
||
|
|
"grad_norm": 0.168966606259346,
|
||
|
|
"learning_rate": 5.9326335692419995e-05,
|
||
|
|
"epoch": 1.6666666666666665,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.1723298192024231,
|
||
|
|
"grad_norm": 0.14092567563056946,
|
||
|
|
"learning_rate": 1.9098300562505266e-05,
|
||
|
|
"epoch": 1.8333333333333335,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.16860610246658325,
|
||
|
|
"grad_norm": 0.13329552114009857,
|
||
|
|
"learning_rate": 5.478104631726711e-07,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"train_runtime": 1079.0765,
|
||
|
|
"train_samples_per_second": 1.112,
|
||
|
|
"train_steps_per_second": 0.056,
|
||
|
|
"total_flos": 5520149869086720.0,
|
||
|
|
"train_loss": 0.6150601516167323,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00047351792454719543,
|
||
|
|
"grad_norm": 0.72265625,
|
||
|
|
"learning_rate": 8.333333333333333e-07,
|
||
|
|
"num_tokens": 13592.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.4375,
|
||
|
|
"rewards/reward_format/std": 0.3535533770918846,
|
||
|
|
"rewards/reward_welfare/mean": 0.0625,
|
||
|
|
"rewards/reward_welfare/std": 0.1767766922712326,
|
||
|
|
"rewards/reward_fairness/mean": 0.03318497911095619,
|
||
|
|
"rewards/reward_fairness/std": 0.09386129677295685,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.02344914712011814,
|
||
|
|
"rewards/reward_composite/std": 0.06632420420646667,
|
||
|
|
"reward": 0.6816341280937195,
|
||
|
|
"reward_std": 0.48826825618743896,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.47351907938718796,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.016666666666666666,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00041250139474868774,
|
||
|
|
"grad_norm": 0.68359375,
|
||
|
|
"learning_rate": 2.5e-06,
|
||
|
|
"num_tokens": 26592.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.33522727340459824,
|
||
|
|
"rewards/reward_format/std": 0.3089452385902405,
|
||
|
|
"rewards/reward_welfare/mean": 0.125,
|
||
|
|
"rewards/reward_welfare/std": 0.2314550280570984,
|
||
|
|
"rewards/reward_fairness/mean": 0.037382133305072784,
|
||
|
|
"rewards/reward_fairness/std": 0.07023922353982925,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.03359023481607437,
|
||
|
|
"rewards/reward_composite/std": 0.06231452897191048,
|
||
|
|
"reward": 0.8607450723648071,
|
||
|
|
"reward_std": 0.4173068106174469,
|
||
|
|
"frac_reward_zero_std": 0.75,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.4125128909945488,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.03333333333333333,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003954425919800997,
|
||
|
|
"grad_norm": 0.00665283203125,
|
||
|
|
"learning_rate": 4.166666666666667e-06,
|
||
|
|
"num_tokens": 39888.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.53125,
|
||
|
|
"rewards/reward_format/std": 0.0883883461356163,
|
||
|
|
"rewards/reward_welfare/mean": 0.0,
|
||
|
|
"rewards/reward_welfare/std": 0.0,
|
||
|
|
"rewards/reward_fairness/mean": 0.0,
|
||
|
|
"rewards/reward_fairness/std": 0.0,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.0,
|
||
|
|
"rewards/reward_composite/std": 0.0,
|
||
|
|
"reward": 0.46875,
|
||
|
|
"reward_std": 0.0625,
|
||
|
|
"frac_reward_zero_std": 0.75,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3954422175884247,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.05,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00040989843546412885,
|
||
|
|
"grad_norm": 0.013671875,
|
||
|
|
"learning_rate": 4.995770395678171e-06,
|
||
|
|
"num_tokens": 53776.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.5,
|
||
|
|
"rewards/reward_format/std": 0.0,
|
||
|
|
"rewards/reward_welfare/mean": 0.0,
|
||
|
|
"rewards/reward_welfare/std": 0.0,
|
||
|
|
"rewards/reward_fairness/mean": 0.0,
|
||
|
|
"rewards/reward_fairness/std": 0.0,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.0,
|
||
|
|
"rewards/reward_composite/std": 0.0,
|
||
|
|
"reward": 0.5,
|
||
|
|
"reward_std": 0.0,
|
||
|
|
"frac_reward_zero_std": 1.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.40989840030670166,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.06666666666666667,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00042488425970077515,
|
||
|
|
"grad_norm": 0.4921875,
|
||
|
|
"learning_rate": 4.962019382530521e-06,
|
||
|
|
"num_tokens": 67664.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.46875,
|
||
|
|
"rewards/reward_format/std": 0.2651650384068489,
|
||
|
|
"rewards/reward_welfare/mean": 0.0625,
|
||
|
|
"rewards/reward_welfare/std": 0.1767766922712326,
|
||
|
|
"rewards/reward_fairness/mean": 0.02351469174027443,
|
||
|
|
"rewards/reward_fairness/std": 0.06650959700345993,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.023048987612128258,
|
||
|
|
"rewards/reward_composite/std": 0.06519238650798798,
|
||
|
|
"reward": 0.6403136849403381,
|
||
|
|
"reward_std": 0.40562736988067627,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.42487896233797073,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.08333333333333333,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003492364485282451,
|
||
|
|
"grad_norm": 0.703125,
|
||
|
|
"learning_rate": 4.894973780788722e-06,
|
||
|
|
"num_tokens": 81552.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.34659090638160706,
|
||
|
|
"rewards/reward_format/std": 0.6987431943416595,
|
||
|
|
"rewards/reward_welfare/mean": 0.1875,
|
||
|
|
"rewards/reward_welfare/std": 0.408231720328331,
|
||
|
|
"rewards/reward_fairness/mean": 0.060186946764588356,
|
||
|
|
"rewards/reward_fairness/std": 0.13657810539007187,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.05276940576732159,
|
||
|
|
"rewards/reward_composite/std": 0.11823124438524246,
|
||
|
|
"reward": 0.9538654386997223,
|
||
|
|
"reward_std": 1.2448847889900208,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.34924405813217163,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.1,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00039254588773474097,
|
||
|
|
"grad_norm": 0.7890625,
|
||
|
|
"learning_rate": 4.7955402672006855e-06,
|
||
|
|
"num_tokens": 95440.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.2755681872367859,
|
||
|
|
"rewards/reward_format/std": 0.5705045461654663,
|
||
|
|
"rewards/reward_welfare/mean": 0.1875,
|
||
|
|
"rewards/reward_welfare/std": 0.408231720328331,
|
||
|
|
"rewards/reward_fairness/mean": 0.05876787751913071,
|
||
|
|
"rewards/reward_fairness/std": 0.13999952003359795,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.051001692190766335,
|
||
|
|
"rewards/reward_composite/std": 0.12243235111236572,
|
||
|
|
"reward": 1.0217013657093048,
|
||
|
|
"reward_std": 1.1684027314186096,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3925560265779495,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.11666666666666667,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00040383817395195365,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 4.665063509461098e-06,
|
||
|
|
"num_tokens": 108736.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.16477272659540176,
|
||
|
|
"rewards/reward_format/std": 0.6252594292163849,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.10286042466759682,
|
||
|
|
"rewards/reward_fairness/std": 0.16851608455181122,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.09322065114974976,
|
||
|
|
"rewards/reward_composite/std": 0.1486019790172577,
|
||
|
|
"reward": 1.343808352947235,
|
||
|
|
"reward_std": 1.4776567816734314,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.40385157614946365,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.13333333333333333,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0004179440438747406,
|
||
|
|
"grad_norm": 0.65234375,
|
||
|
|
"learning_rate": 4.50530798188761e-06,
|
||
|
|
"num_tokens": 122624.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.34375,
|
||
|
|
"rewards/reward_format/std": 0.5564062297344208,
|
||
|
|
"rewards/reward_welfare/mean": 0.125,
|
||
|
|
"rewards/reward_welfare/std": 0.3535533845424652,
|
||
|
|
"rewards/reward_fairness/mean": 0.04079132154583931,
|
||
|
|
"rewards/reward_fairness/std": 0.11537527851760387,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.035708064679056406,
|
||
|
|
"rewards/reward_composite/std": 0.10099766962230206,
|
||
|
|
"reward": 0.8577493727207184,
|
||
|
|
"reward_std": 0.8404987752437592,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.41793932020664215,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.15,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00039753690361976624,
|
||
|
|
"grad_norm": 0.75,
|
||
|
|
"learning_rate": 4.318434103932622e-06,
|
||
|
|
"num_tokens": 136512.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.35795454680919647,
|
||
|
|
"rewards/reward_format/std": 0.6613655686378479,
|
||
|
|
"rewards/reward_welfare/mean": 0.1875,
|
||
|
|
"rewards/reward_welfare/std": 0.408231720328331,
|
||
|
|
"rewards/reward_fairness/mean": 0.05811220221221447,
|
||
|
|
"rewards/reward_fairness/std": 0.1433359570801258,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.04960842803120613,
|
||
|
|
"rewards/reward_composite/std": 0.11969681829214096,
|
||
|
|
"reward": 0.9372660517692566,
|
||
|
|
"reward_std": 0.9138101935386658,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.39755555987358093,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.16666666666666666,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003871597582474351,
|
||
|
|
"grad_norm": 0.005157470703125,
|
||
|
|
"learning_rate": 4.106969024216348e-06,
|
||
|
|
"num_tokens": 150104.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.21875,
|
||
|
|
"rewards/reward_format/std": 0.38816189765930176,
|
||
|
|
"rewards/reward_welfare/mean": 0.1875,
|
||
|
|
"rewards/reward_welfare/std": 0.25877460837364197,
|
||
|
|
"rewards/reward_fairness/mean": 0.09772966802120209,
|
||
|
|
"rewards/reward_fairness/std": 0.1411271095275879,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.07592316716909409,
|
||
|
|
"rewards/reward_composite/std": 0.10568810254335403,
|
||
|
|
"reward": 1.1424028873443604,
|
||
|
|
"reward_std": 0.9167249202728271,
|
||
|
|
"frac_reward_zero_std": 0.5,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3871647119522095,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.18333333333333332,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00046034157276153564,
|
||
|
|
"grad_norm": 0.828125,
|
||
|
|
"learning_rate": 3.8737724451770155e-06,
|
||
|
|
"num_tokens": 163992.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.053977273404598236,
|
||
|
|
"rewards/reward_format/std": 0.6971071362495422,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.11436978727579117,
|
||
|
|
"rewards/reward_fairness/std": 0.21275469660758972,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.08883418142795563,
|
||
|
|
"rewards/reward_composite/std": 0.15858761221170425,
|
||
|
|
"reward": 1.4617266654968262,
|
||
|
|
"reward_std": 1.6014615297317505,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.46036188304424286,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.2,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037954188883304596,
|
||
|
|
"grad_norm": 0.75390625,
|
||
|
|
"learning_rate": 3.621997950501156e-06,
|
||
|
|
"num_tokens": 177584.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.11079545319080353,
|
||
|
|
"rewards/reward_format/std": 0.7605703175067902,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.13740837946534157,
|
||
|
|
"rewards/reward_fairness/std": 0.23384775966405869,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.1134110763669014,
|
||
|
|
"rewards/reward_composite/std": 0.1934959888458252,
|
||
|
|
"reward": 1.4525240659713745,
|
||
|
|
"reward_std": 1.4633366465568542,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3795487657189369,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.21666666666666667,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00034568458795547485,
|
||
|
|
"grad_norm": 0.67578125,
|
||
|
|
"learning_rate": 3.3550503583141726e-06,
|
||
|
|
"num_tokens": 190880.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.014204561710357666,
|
||
|
|
"rewards/reward_format/std": 0.45640653371810913,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.2314550280570984,
|
||
|
|
"rewards/reward_fairness/mean": 0.17191734910011292,
|
||
|
|
"rewards/reward_fairness/std": 0.165505051612854,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.12545911967754364,
|
||
|
|
"rewards/reward_composite/std": 0.09392639249563217,
|
||
|
|
"reward": 1.6581718921661377,
|
||
|
|
"reward_std": 0.9023097902536392,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3456726223230362,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.23333333333333334,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0004424452781677246,
|
||
|
|
"grad_norm": 0.79296875,
|
||
|
|
"learning_rate": 3.0765396768561005e-06,
|
||
|
|
"num_tokens": 204472.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.05681818723678589,
|
||
|
|
"rewards/reward_format/std": 0.8016891181468964,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.49871626496315,
|
||
|
|
"rewards/reward_fairness/mean": 0.11990131065249443,
|
||
|
|
"rewards/reward_fairness/std": 0.19920051097869873,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.10109234228730202,
|
||
|
|
"rewards/reward_composite/std": 0.15720761567354202,
|
||
|
|
"reward": 1.5391755104064941,
|
||
|
|
"reward_std": 1.311523675918579,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.44244876503944397,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.25,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00044108927249908447,
|
||
|
|
"grad_norm": 0.79296875,
|
||
|
|
"learning_rate": 2.7902322853130758e-06,
|
||
|
|
"num_tokens": 218360.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.23011364042758942,
|
||
|
|
"rewards/reward_format/std": 0.7126934230327606,
|
||
|
|
"rewards/reward_welfare/mean": 0.25,
|
||
|
|
"rewards/reward_welfare/std": 0.4355513006448746,
|
||
|
|
"rewards/reward_fairness/mean": 0.09009831957519054,
|
||
|
|
"rewards/reward_fairness/std": 0.17519650608301163,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.07756261341273785,
|
||
|
|
"rewards/reward_composite/std": 0.1447325348854065,
|
||
|
|
"reward": 1.1875473260879517,
|
||
|
|
"reward_std": 1.3125466108322144,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.4411006420850754,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.26666666666666666,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00043725594878196716,
|
||
|
|
"grad_norm": 0.765625,
|
||
|
|
"learning_rate": 2.5e-06,
|
||
|
|
"num_tokens": 232248.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.3238636404275894,
|
||
|
|
"rewards/reward_format/std": 0.7694187164306641,
|
||
|
|
"rewards/reward_welfare/mean": 0.5625,
|
||
|
|
"rewards/reward_welfare/std": 0.5260358452796936,
|
||
|
|
"rewards/reward_fairness/mean": 0.19406583905220032,
|
||
|
|
"rewards/reward_fairness/std": 0.20889797061681747,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.1601117104291916,
|
||
|
|
"rewards/reward_composite/std": 0.16504594683647156,
|
||
|
|
"reward": 2.2405412197113037,
|
||
|
|
"reward_std": 1.72732412815094,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.43723437190055847,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.2833333333333333,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003281831741333008,
|
||
|
|
"grad_norm": 0.66015625,
|
||
|
|
"learning_rate": 2.2097677146869242e-06,
|
||
|
|
"num_tokens": 245840.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.0625,
|
||
|
|
"rewards/reward_format/std": 0.8398386240005493,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.5175492167472839,
|
||
|
|
"rewards/reward_fairness/mean": 0.208244688808918,
|
||
|
|
"rewards/reward_fairness/std": 0.3558191955089569,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.12120818346738815,
|
||
|
|
"rewards/reward_composite/std": 0.17605619877576828,
|
||
|
|
"reward": 1.6419528722763062,
|
||
|
|
"reward_std": 1.9059234857559204,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.32817772775888443,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.3,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003703221445903182,
|
||
|
|
"grad_norm": 0.65625,
|
||
|
|
"learning_rate": 1.9234603231439e-06,
|
||
|
|
"num_tokens": 258840.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.34375,
|
||
|
|
"rewards/reward_format/std": 0.4355708882212639,
|
||
|
|
"rewards/reward_welfare/mean": 0.125,
|
||
|
|
"rewards/reward_welfare/std": 0.2314550280570984,
|
||
|
|
"rewards/reward_fairness/mean": 0.032129574567079544,
|
||
|
|
"rewards/reward_fairness/std": 0.06755802780389786,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.0333767905831337,
|
||
|
|
"rewards/reward_composite/std": 0.07086637616157532,
|
||
|
|
"reward": 0.8467563986778259,
|
||
|
|
"reward_std": 0.8185127973556519,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3703107312321663,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.31666666666666665,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.0003897678107023239,
|
||
|
|
"grad_norm": 0.76171875,
|
||
|
|
"learning_rate": 1.6449496416858285e-06,
|
||
|
|
"num_tokens": 272136.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.07102272659540176,
|
||
|
|
"rewards/reward_format/std": 0.8448813557624817,
|
||
|
|
"rewards/reward_welfare/mean": 0.4375,
|
||
|
|
"rewards/reward_welfare/std": 0.5260358452796936,
|
||
|
|
"rewards/reward_fairness/mean": 0.13828522339463234,
|
||
|
|
"rewards/reward_fairness/std": 0.19835777580738068,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.11603889241814613,
|
||
|
|
"rewards/reward_composite/std": 0.15580761432647705,
|
||
|
|
"reward": 1.762846827507019,
|
||
|
|
"reward_std": 1.750555157661438,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.38978311419487,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.3333333333333333,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00036280229687690735,
|
||
|
|
"grad_norm": 0.52734375,
|
||
|
|
"learning_rate": 1.3780020494988447e-06,
|
||
|
|
"num_tokens": 286024.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.21875,
|
||
|
|
"rewards/reward_format/std": 0.7038120031356812,
|
||
|
|
"rewards/reward_welfare/mean": 0.25,
|
||
|
|
"rewards/reward_welfare/std": 0.4629100561141968,
|
||
|
|
"rewards/reward_fairness/mean": 0.08049380034208298,
|
||
|
|
"rewards/reward_fairness/std": 0.14985806494951248,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.06840312853455544,
|
||
|
|
"rewards/reward_composite/std": 0.12792598456144333,
|
||
|
|
"reward": 1.1801469326019287,
|
||
|
|
"reward_std": 1.2528201341629028,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.36278442293405533,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.35,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00041984766721725464,
|
||
|
|
"grad_norm": 0.734375,
|
||
|
|
"learning_rate": 1.1262275548229852e-06,
|
||
|
|
"num_tokens": 299320.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.125,
|
||
|
|
"rewards/reward_format/std": 0.7535476386547089,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.44403792917728424,
|
||
|
|
"rewards/reward_fairness/mean": 0.08010485023260117,
|
||
|
|
"rewards/reward_fairness/std": 0.13119615614414215,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.06627386063337326,
|
||
|
|
"rewards/reward_composite/std": 0.0994122326374054,
|
||
|
|
"reward": 1.333878755569458,
|
||
|
|
"reward_std": 1.2175767719745636,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.4198339805006981,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.36666666666666664,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037025846540927887,
|
||
|
|
"grad_norm": 0.74609375,
|
||
|
|
"learning_rate": 8.930309757836517e-07,
|
||
|
|
"num_tokens": 312912.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.15625,
|
||
|
|
"rewards/reward_format/std": 0.7469770908355713,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.44403792917728424,
|
||
|
|
"rewards/reward_fairness/mean": 0.0987030416727066,
|
||
|
|
"rewards/reward_fairness/std": 0.13824082165956497,
|
||
|
|
"rewards/reward_stability/mean": 0.9375,
|
||
|
|
"rewards/reward_stability/std": 0.1767766922712326,
|
||
|
|
"rewards/reward_composite/mean": 0.0686455499380827,
|
||
|
|
"rewards/reward_composite/std": 0.11963466554880142,
|
||
|
|
"reward": 1.2610985934734344,
|
||
|
|
"reward_std": 1.1380138397216797,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3702595606446266,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.38333333333333336,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037697795778512955,
|
||
|
|
"grad_norm": 0.68359375,
|
||
|
|
"learning_rate": 6.815658960673782e-07,
|
||
|
|
"num_tokens": 326208.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.07954545319080353,
|
||
|
|
"rewards/reward_format/std": 0.7752929627895355,
|
||
|
|
"rewards/reward_welfare/mean": 0.4375,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.1642819568514824,
|
||
|
|
"rewards/reward_fairness/std": 0.2405308187007904,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.12362180650234222,
|
||
|
|
"rewards/reward_composite/std": 0.16839426010847092,
|
||
|
|
"reward": 1.8049492835998535,
|
||
|
|
"reward_std": 1.6678152084350586,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3769652917981148,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.4,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00039646029472351074,
|
||
|
|
"grad_norm": 0.6953125,
|
||
|
|
"learning_rate": 4.946920181123904e-07,
|
||
|
|
"num_tokens": 339800.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.06818181276321411,
|
||
|
|
"rewards/reward_format/std": 0.7570639848709106,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.12952633947134018,
|
||
|
|
"rewards/reward_fairness/std": 0.2085839882493019,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.10791970416903496,
|
||
|
|
"rewards/reward_composite/std": 0.1719568744301796,
|
||
|
|
"reward": 1.4817642569541931,
|
||
|
|
"reward_std": 1.4108701944351196,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.39645931124687195,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.4166666666666667,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.000441722571849823,
|
||
|
|
"grad_norm": 0.7265625,
|
||
|
|
"learning_rate": 3.3493649053890325e-07,
|
||
|
|
"num_tokens": 353688.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.03125,
|
||
|
|
"rewards/reward_format/std": 0.8107390403747559,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.5175492167472839,
|
||
|
|
"rewards/reward_fairness/mean": 0.10219378396868706,
|
||
|
|
"rewards/reward_fairness/std": 0.1520508974790573,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.09825712814927101,
|
||
|
|
"rewards/reward_composite/std": 0.1404884159564972,
|
||
|
|
"reward": 1.6067009568214417,
|
||
|
|
"reward_std": 1.2484731674194336,
|
||
|
|
"frac_reward_zero_std": 0.25,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.44170165807008743,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.43333333333333335,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037848297506570816,
|
||
|
|
"grad_norm": 0.796875,
|
||
|
|
"learning_rate": 2.044597327993153e-07,
|
||
|
|
"num_tokens": 367280.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": 0.23579545319080353,
|
||
|
|
"rewards/reward_format/std": 0.7612137198448181,
|
||
|
|
"rewards/reward_welfare/mean": 0.5625,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.2322249710559845,
|
||
|
|
"rewards/reward_fairness/std": 0.20398348569869995,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.19660773873329163,
|
||
|
|
"rewards/reward_composite/std": 0.17612425237894058,
|
||
|
|
"reward": 2.2271281480789185,
|
||
|
|
"reward_std": 1.4199119210243225,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3786723464727402,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.45,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00040780752897262573,
|
||
|
|
"grad_norm": 0.65234375,
|
||
|
|
"learning_rate": 1.0502621921127776e-07,
|
||
|
|
"num_tokens": 380872.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.0625,
|
||
|
|
"rewards/reward_format/std": 0.7646470665931702,
|
||
|
|
"rewards/reward_welfare/mean": 0.3125,
|
||
|
|
"rewards/reward_welfare/std": 0.49022963643074036,
|
||
|
|
"rewards/reward_fairness/mean": 0.09930047020316124,
|
||
|
|
"rewards/reward_fairness/std": 0.18247877806425095,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.08683200180530548,
|
||
|
|
"rewards/reward_composite/std": 0.15039421617984772,
|
||
|
|
"reward": 1.4361324906349182,
|
||
|
|
"reward_std": 1.6369856595993042,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.4077882617712021,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.4666666666666667,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00038760900497436523,
|
||
|
|
"grad_norm": 0.74609375,
|
||
|
|
"learning_rate": 3.798061746947995e-08,
|
||
|
|
"num_tokens": 394168.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.002840910106897354,
|
||
|
|
"rewards/reward_format/std": 0.8362354636192322,
|
||
|
|
"rewards/reward_welfare/mean": 0.375,
|
||
|
|
"rewards/reward_welfare/std": 0.5175492167472839,
|
||
|
|
"rewards/reward_fairness/mean": 0.11489119380712509,
|
||
|
|
"rewards/reward_fairness/std": 0.17957812547683716,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.11398349702358246,
|
||
|
|
"rewards/reward_composite/std": 0.17128486931324005,
|
||
|
|
"reward": 1.6010336875915527,
|
||
|
|
"reward_std": 1.725760817527771,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3876567706465721,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.48333333333333334,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"loss": 0.00037222355604171753,
|
||
|
|
"grad_norm": 0.703125,
|
||
|
|
"learning_rate": 4.229604321829561e-09,
|
||
|
|
"num_tokens": 407760.0,
|
||
|
|
"completions/mean_length": 400.0,
|
||
|
|
"completions/min_length": 400.0,
|
||
|
|
"completions/max_length": 400.0,
|
||
|
|
"completions/clipped_ratio": 1.0,
|
||
|
|
"completions/mean_terminated_length": 0.0,
|
||
|
|
"completions/min_terminated_length": 0.0,
|
||
|
|
"completions/max_terminated_length": 0.0,
|
||
|
|
"rewards/reward_format/mean": -0.46875,
|
||
|
|
"rewards/reward_format/std": 0.6302918791770935,
|
||
|
|
"rewards/reward_welfare/mean": 0.125,
|
||
|
|
"rewards/reward_welfare/std": 0.3535533845424652,
|
||
|
|
"rewards/reward_fairness/mean": 0.015625,
|
||
|
|
"rewards/reward_fairness/std": 0.04419417306780815,
|
||
|
|
"rewards/reward_stability/mean": 1.0,
|
||
|
|
"rewards/reward_stability/std": 0.0,
|
||
|
|
"rewards/reward_composite/mean": 0.018206155858933926,
|
||
|
|
"rewards/reward_composite/std": 0.05149478651583195,
|
||
|
|
"reward": 0.6900811493396759,
|
||
|
|
"reward_std": 0.873493492603302,
|
||
|
|
"frac_reward_zero_std": 0.0,
|
||
|
|
"completion_length": 400.0,
|
||
|
|
"kl": 0.3722131997346878,
|
||
|
|
"clip_ratio/low_mean": 0.0,
|
||
|
|
"clip_ratio/low_min": 0.0,
|
||
|
|
"clip_ratio/high_mean": 0.0,
|
||
|
|
"clip_ratio/high_max": 0.0,
|
||
|
|
"clip_ratio/region_mean": 0.0,
|
||
|
|
"epoch": 0.5,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"train_runtime": 1880.1016,
|
||
|
|
"train_samples_per_second": 0.255,
|
||
|
|
"train_steps_per_second": 0.032,
|
||
|
|
"total_flos": 0.0,
|
||
|
|
"train_loss": 0.0003991109939912955,
|
||
|
|
"epoch": 0.5,
|
||
|
|
"step": 60
|
||
|
|
}
|
||
|
|
]
|
||
|
|
}
|