Files
daedalus-designer-v2/training_history.json

2313 lines
86 KiB
JSON
Raw Permalink Normal View History

{
"phase": "sft+grpo",
"sft_history": [
{
"loss": 1.8329996109008788,
"grad_norm": 2.6284756660461426,
"learning_rate": 2.6666666666666667e-05,
"epoch": 0.16666666666666666,
"step": 5
},
{
"loss": 1.641743278503418,
"grad_norm": 0.9074174761772156,
"learning_rate": 6e-05,
"epoch": 0.3333333333333333,
"step": 10
},
{
"loss": 1.3325251579284667,
"grad_norm": 0.772527277469635,
"learning_rate": 9.333333333333334e-05,
"epoch": 0.5,
"step": 15
},
{
"loss": 0.908332347869873,
"grad_norm": 0.8558230400085449,
"learning_rate": 0.00012666666666666666,
"epoch": 0.6666666666666666,
"step": 20
},
{
"loss": 0.4191232204437256,
"grad_norm": 0.6383947134017944,
"learning_rate": 0.00016,
"epoch": 0.8333333333333334,
"step": 25
},
{
"loss": 0.20252063274383544,
"grad_norm": 0.24536560475826263,
"learning_rate": 0.00019333333333333333,
"epoch": 1.0,
"step": 30
},
{
"loss": 0.1843562602996826,
"grad_norm": 0.1841956526041031,
"learning_rate": 0.0001913545457642601,
"epoch": 1.1666666666666667,
"step": 35
},
{
"loss": 0.1743373155593872,
"grad_norm": 0.12225674837827682,
"learning_rate": 0.00015877852522924732,
"epoch": 1.3333333333333333,
"step": 40
},
{
"loss": 0.1707882285118103,
"grad_norm": 0.11675203591585159,
"learning_rate": 0.00011045284632676536,
"epoch": 1.5,
"step": 45
},
{
"loss": 0.17305984497070312,
"grad_norm": 0.168966606259346,
"learning_rate": 5.9326335692419995e-05,
"epoch": 1.6666666666666665,
"step": 50
},
{
"loss": 0.1723298192024231,
"grad_norm": 0.14092567563056946,
"learning_rate": 1.9098300562505266e-05,
"epoch": 1.8333333333333335,
"step": 55
},
{
"loss": 0.16860610246658325,
"grad_norm": 0.13329552114009857,
"learning_rate": 5.478104631726711e-07,
"epoch": 2.0,
"step": 60
},
{
"train_runtime": 1079.0765,
"train_samples_per_second": 1.112,
"train_steps_per_second": 0.056,
"total_flos": 5520149869086720.0,
"train_loss": 0.6150601516167323,
"epoch": 2.0,
"step": 60
}
],
"grpo_history": [
{
"loss": 0.00047351792454719543,
"grad_norm": 0.72265625,
"learning_rate": 8.333333333333333e-07,
"num_tokens": 13592.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.4375,
"rewards/reward_format/std": 0.3535533770918846,
"rewards/reward_welfare/mean": 0.0625,
"rewards/reward_welfare/std": 0.1767766922712326,
"rewards/reward_fairness/mean": 0.03318497911095619,
"rewards/reward_fairness/std": 0.09386129677295685,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.02344914712011814,
"rewards/reward_composite/std": 0.06632420420646667,
"reward": 0.6816341280937195,
"reward_std": 0.48826825618743896,
"frac_reward_zero_std": 0.5,
"completion_length": 400.0,
"kl": 0.47351907938718796,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.016666666666666666,
"step": 2
},
{
"loss": 0.00041250139474868774,
"grad_norm": 0.68359375,
"learning_rate": 2.5e-06,
"num_tokens": 26592.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.33522727340459824,
"rewards/reward_format/std": 0.3089452385902405,
"rewards/reward_welfare/mean": 0.125,
"rewards/reward_welfare/std": 0.2314550280570984,
"rewards/reward_fairness/mean": 0.037382133305072784,
"rewards/reward_fairness/std": 0.07023922353982925,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.03359023481607437,
"rewards/reward_composite/std": 0.06231452897191048,
"reward": 0.8607450723648071,
"reward_std": 0.4173068106174469,
"frac_reward_zero_std": 0.75,
"completion_length": 400.0,
"kl": 0.4125128909945488,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.03333333333333333,
"step": 4
},
{
"loss": 0.0003954425919800997,
"grad_norm": 0.00665283203125,
"learning_rate": 4.166666666666667e-06,
"num_tokens": 39888.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.53125,
"rewards/reward_format/std": 0.0883883461356163,
"rewards/reward_welfare/mean": 0.0,
"rewards/reward_welfare/std": 0.0,
"rewards/reward_fairness/mean": 0.0,
"rewards/reward_fairness/std": 0.0,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.0,
"rewards/reward_composite/std": 0.0,
"reward": 0.46875,
"reward_std": 0.0625,
"frac_reward_zero_std": 0.75,
"completion_length": 400.0,
"kl": 0.3954422175884247,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.05,
"step": 6
},
{
"loss": 0.00040989843546412885,
"grad_norm": 0.013671875,
"learning_rate": 4.995770395678171e-06,
"num_tokens": 53776.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.5,
"rewards/reward_format/std": 0.0,
"rewards/reward_welfare/mean": 0.0,
"rewards/reward_welfare/std": 0.0,
"rewards/reward_fairness/mean": 0.0,
"rewards/reward_fairness/std": 0.0,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.0,
"rewards/reward_composite/std": 0.0,
"reward": 0.5,
"reward_std": 0.0,
"frac_reward_zero_std": 1.0,
"completion_length": 400.0,
"kl": 0.40989840030670166,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.06666666666666667,
"step": 8
},
{
"loss": 0.00042488425970077515,
"grad_norm": 0.4921875,
"learning_rate": 4.962019382530521e-06,
"num_tokens": 67664.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.46875,
"rewards/reward_format/std": 0.2651650384068489,
"rewards/reward_welfare/mean": 0.0625,
"rewards/reward_welfare/std": 0.1767766922712326,
"rewards/reward_fairness/mean": 0.02351469174027443,
"rewards/reward_fairness/std": 0.06650959700345993,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.023048987612128258,
"rewards/reward_composite/std": 0.06519238650798798,
"reward": 0.6403136849403381,
"reward_std": 0.40562736988067627,
"frac_reward_zero_std": 0.5,
"completion_length": 400.0,
"kl": 0.42487896233797073,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.08333333333333333,
"step": 10
},
{
"loss": 0.0003492364485282451,
"grad_norm": 0.703125,
"learning_rate": 4.894973780788722e-06,
"num_tokens": 81552.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.34659090638160706,
"rewards/reward_format/std": 0.6987431943416595,
"rewards/reward_welfare/mean": 0.1875,
"rewards/reward_welfare/std": 0.408231720328331,
"rewards/reward_fairness/mean": 0.060186946764588356,
"rewards/reward_fairness/std": 0.13657810539007187,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.05276940576732159,
"rewards/reward_composite/std": 0.11823124438524246,
"reward": 0.9538654386997223,
"reward_std": 1.2448847889900208,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.34924405813217163,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.1,
"step": 12
},
{
"loss": 0.00039254588773474097,
"grad_norm": 0.7890625,
"learning_rate": 4.7955402672006855e-06,
"num_tokens": 95440.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.2755681872367859,
"rewards/reward_format/std": 0.5705045461654663,
"rewards/reward_welfare/mean": 0.1875,
"rewards/reward_welfare/std": 0.408231720328331,
"rewards/reward_fairness/mean": 0.05876787751913071,
"rewards/reward_fairness/std": 0.13999952003359795,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.051001692190766335,
"rewards/reward_composite/std": 0.12243235111236572,
"reward": 1.0217013657093048,
"reward_std": 1.1684027314186096,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3925560265779495,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.11666666666666667,
"step": 14
},
{
"loss": 0.00040383817395195365,
"grad_norm": 0.86328125,
"learning_rate": 4.665063509461098e-06,
"num_tokens": 108736.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.16477272659540176,
"rewards/reward_format/std": 0.6252594292163849,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.10286042466759682,
"rewards/reward_fairness/std": 0.16851608455181122,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.09322065114974976,
"rewards/reward_composite/std": 0.1486019790172577,
"reward": 1.343808352947235,
"reward_std": 1.4776567816734314,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.40385157614946365,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.13333333333333333,
"step": 16
},
{
"loss": 0.0004179440438747406,
"grad_norm": 0.65234375,
"learning_rate": 4.50530798188761e-06,
"num_tokens": 122624.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.34375,
"rewards/reward_format/std": 0.5564062297344208,
"rewards/reward_welfare/mean": 0.125,
"rewards/reward_welfare/std": 0.3535533845424652,
"rewards/reward_fairness/mean": 0.04079132154583931,
"rewards/reward_fairness/std": 0.11537527851760387,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.035708064679056406,
"rewards/reward_composite/std": 0.10099766962230206,
"reward": 0.8577493727207184,
"reward_std": 0.8404987752437592,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.41793932020664215,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.15,
"step": 18
},
{
"loss": 0.00039753690361976624,
"grad_norm": 0.75,
"learning_rate": 4.318434103932622e-06,
"num_tokens": 136512.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.35795454680919647,
"rewards/reward_format/std": 0.6613655686378479,
"rewards/reward_welfare/mean": 0.1875,
"rewards/reward_welfare/std": 0.408231720328331,
"rewards/reward_fairness/mean": 0.05811220221221447,
"rewards/reward_fairness/std": 0.1433359570801258,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.04960842803120613,
"rewards/reward_composite/std": 0.11969681829214096,
"reward": 0.9372660517692566,
"reward_std": 0.9138101935386658,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.39755555987358093,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.16666666666666666,
"step": 20
},
{
"loss": 0.0003871597582474351,
"grad_norm": 0.005157470703125,
"learning_rate": 4.106969024216348e-06,
"num_tokens": 150104.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.21875,
"rewards/reward_format/std": 0.38816189765930176,
"rewards/reward_welfare/mean": 0.1875,
"rewards/reward_welfare/std": 0.25877460837364197,
"rewards/reward_fairness/mean": 0.09772966802120209,
"rewards/reward_fairness/std": 0.1411271095275879,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.07592316716909409,
"rewards/reward_composite/std": 0.10568810254335403,
"reward": 1.1424028873443604,
"reward_std": 0.9167249202728271,
"frac_reward_zero_std": 0.5,
"completion_length": 400.0,
"kl": 0.3871647119522095,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.18333333333333332,
"step": 22
},
{
"loss": 0.00046034157276153564,
"grad_norm": 0.828125,
"learning_rate": 3.8737724451770155e-06,
"num_tokens": 163992.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.053977273404598236,
"rewards/reward_format/std": 0.6971071362495422,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.11436978727579117,
"rewards/reward_fairness/std": 0.21275469660758972,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.08883418142795563,
"rewards/reward_composite/std": 0.15858761221170425,
"reward": 1.4617266654968262,
"reward_std": 1.6014615297317505,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.46036188304424286,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.2,
"step": 24
},
{
"loss": 0.00037954188883304596,
"grad_norm": 0.75390625,
"learning_rate": 3.621997950501156e-06,
"num_tokens": 177584.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.11079545319080353,
"rewards/reward_format/std": 0.7605703175067902,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.13740837946534157,
"rewards/reward_fairness/std": 0.23384775966405869,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.1134110763669014,
"rewards/reward_composite/std": 0.1934959888458252,
"reward": 1.4525240659713745,
"reward_std": 1.4633366465568542,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3795487657189369,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.21666666666666667,
"step": 26
},
{
"loss": 0.00034568458795547485,
"grad_norm": 0.67578125,
"learning_rate": 3.3550503583141726e-06,
"num_tokens": 190880.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.014204561710357666,
"rewards/reward_format/std": 0.45640653371810913,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.2314550280570984,
"rewards/reward_fairness/mean": 0.17191734910011292,
"rewards/reward_fairness/std": 0.165505051612854,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.12545911967754364,
"rewards/reward_composite/std": 0.09392639249563217,
"reward": 1.6581718921661377,
"reward_std": 0.9023097902536392,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.3456726223230362,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.23333333333333334,
"step": 28
},
{
"loss": 0.0004424452781677246,
"grad_norm": 0.79296875,
"learning_rate": 3.0765396768561005e-06,
"num_tokens": 204472.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.05681818723678589,
"rewards/reward_format/std": 0.8016891181468964,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.49871626496315,
"rewards/reward_fairness/mean": 0.11990131065249443,
"rewards/reward_fairness/std": 0.19920051097869873,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.10109234228730202,
"rewards/reward_composite/std": 0.15720761567354202,
"reward": 1.5391755104064941,
"reward_std": 1.311523675918579,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.44244876503944397,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.25,
"step": 30
},
{
"loss": 0.00044108927249908447,
"grad_norm": 0.79296875,
"learning_rate": 2.7902322853130758e-06,
"num_tokens": 218360.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.23011364042758942,
"rewards/reward_format/std": 0.7126934230327606,
"rewards/reward_welfare/mean": 0.25,
"rewards/reward_welfare/std": 0.4355513006448746,
"rewards/reward_fairness/mean": 0.09009831957519054,
"rewards/reward_fairness/std": 0.17519650608301163,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.07756261341273785,
"rewards/reward_composite/std": 0.1447325348854065,
"reward": 1.1875473260879517,
"reward_std": 1.3125466108322144,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.4411006420850754,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.26666666666666666,
"step": 32
},
{
"loss": 0.00043725594878196716,
"grad_norm": 0.765625,
"learning_rate": 2.5e-06,
"num_tokens": 232248.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.3238636404275894,
"rewards/reward_format/std": 0.7694187164306641,
"rewards/reward_welfare/mean": 0.5625,
"rewards/reward_welfare/std": 0.5260358452796936,
"rewards/reward_fairness/mean": 0.19406583905220032,
"rewards/reward_fairness/std": 0.20889797061681747,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.1601117104291916,
"rewards/reward_composite/std": 0.16504594683647156,
"reward": 2.2405412197113037,
"reward_std": 1.72732412815094,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.43723437190055847,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.2833333333333333,
"step": 34
},
{
"loss": 0.0003281831741333008,
"grad_norm": 0.66015625,
"learning_rate": 2.2097677146869242e-06,
"num_tokens": 245840.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.0625,
"rewards/reward_format/std": 0.8398386240005493,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.5175492167472839,
"rewards/reward_fairness/mean": 0.208244688808918,
"rewards/reward_fairness/std": 0.3558191955089569,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.12120818346738815,
"rewards/reward_composite/std": 0.17605619877576828,
"reward": 1.6419528722763062,
"reward_std": 1.9059234857559204,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.32817772775888443,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.3,
"step": 36
},
{
"loss": 0.0003703221445903182,
"grad_norm": 0.65625,
"learning_rate": 1.9234603231439e-06,
"num_tokens": 258840.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.34375,
"rewards/reward_format/std": 0.4355708882212639,
"rewards/reward_welfare/mean": 0.125,
"rewards/reward_welfare/std": 0.2314550280570984,
"rewards/reward_fairness/mean": 0.032129574567079544,
"rewards/reward_fairness/std": 0.06755802780389786,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.0333767905831337,
"rewards/reward_composite/std": 0.07086637616157532,
"reward": 0.8467563986778259,
"reward_std": 0.8185127973556519,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.3703107312321663,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.31666666666666665,
"step": 38
},
{
"loss": 0.0003897678107023239,
"grad_norm": 0.76171875,
"learning_rate": 1.6449496416858285e-06,
"num_tokens": 272136.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.07102272659540176,
"rewards/reward_format/std": 0.8448813557624817,
"rewards/reward_welfare/mean": 0.4375,
"rewards/reward_welfare/std": 0.5260358452796936,
"rewards/reward_fairness/mean": 0.13828522339463234,
"rewards/reward_fairness/std": 0.19835777580738068,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.11603889241814613,
"rewards/reward_composite/std": 0.15580761432647705,
"reward": 1.762846827507019,
"reward_std": 1.750555157661438,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.38978311419487,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.3333333333333333,
"step": 40
},
{
"loss": 0.00036280229687690735,
"grad_norm": 0.52734375,
"learning_rate": 1.3780020494988447e-06,
"num_tokens": 286024.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.21875,
"rewards/reward_format/std": 0.7038120031356812,
"rewards/reward_welfare/mean": 0.25,
"rewards/reward_welfare/std": 0.4629100561141968,
"rewards/reward_fairness/mean": 0.08049380034208298,
"rewards/reward_fairness/std": 0.14985806494951248,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.06840312853455544,
"rewards/reward_composite/std": 0.12792598456144333,
"reward": 1.1801469326019287,
"reward_std": 1.2528201341629028,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.36278442293405533,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.35,
"step": 42
},
{
"loss": 0.00041984766721725464,
"grad_norm": 0.734375,
"learning_rate": 1.1262275548229852e-06,
"num_tokens": 299320.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.125,
"rewards/reward_format/std": 0.7535476386547089,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.44403792917728424,
"rewards/reward_fairness/mean": 0.08010485023260117,
"rewards/reward_fairness/std": 0.13119615614414215,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.06627386063337326,
"rewards/reward_composite/std": 0.0994122326374054,
"reward": 1.333878755569458,
"reward_std": 1.2175767719745636,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.4198339805006981,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.36666666666666664,
"step": 44
},
{
"loss": 0.00037025846540927887,
"grad_norm": 0.74609375,
"learning_rate": 8.930309757836517e-07,
"num_tokens": 312912.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.15625,
"rewards/reward_format/std": 0.7469770908355713,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.44403792917728424,
"rewards/reward_fairness/mean": 0.0987030416727066,
"rewards/reward_fairness/std": 0.13824082165956497,
"rewards/reward_stability/mean": 0.9375,
"rewards/reward_stability/std": 0.1767766922712326,
"rewards/reward_composite/mean": 0.0686455499380827,
"rewards/reward_composite/std": 0.11963466554880142,
"reward": 1.2610985934734344,
"reward_std": 1.1380138397216797,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.3702595606446266,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.38333333333333336,
"step": 46
},
{
"loss": 0.00037697795778512955,
"grad_norm": 0.68359375,
"learning_rate": 6.815658960673782e-07,
"num_tokens": 326208.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.07954545319080353,
"rewards/reward_format/std": 0.7752929627895355,
"rewards/reward_welfare/mean": 0.4375,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.1642819568514824,
"rewards/reward_fairness/std": 0.2405308187007904,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.12362180650234222,
"rewards/reward_composite/std": 0.16839426010847092,
"reward": 1.8049492835998535,
"reward_std": 1.6678152084350586,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3769652917981148,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.4,
"step": 48
},
{
"loss": 0.00039646029472351074,
"grad_norm": 0.6953125,
"learning_rate": 4.946920181123904e-07,
"num_tokens": 339800.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.06818181276321411,
"rewards/reward_format/std": 0.7570639848709106,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.12952633947134018,
"rewards/reward_fairness/std": 0.2085839882493019,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.10791970416903496,
"rewards/reward_composite/std": 0.1719568744301796,
"reward": 1.4817642569541931,
"reward_std": 1.4108701944351196,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.39645931124687195,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.4166666666666667,
"step": 50
},
{
"loss": 0.000441722571849823,
"grad_norm": 0.7265625,
"learning_rate": 3.3493649053890325e-07,
"num_tokens": 353688.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.03125,
"rewards/reward_format/std": 0.8107390403747559,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.5175492167472839,
"rewards/reward_fairness/mean": 0.10219378396868706,
"rewards/reward_fairness/std": 0.1520508974790573,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.09825712814927101,
"rewards/reward_composite/std": 0.1404884159564972,
"reward": 1.6067009568214417,
"reward_std": 1.2484731674194336,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.44170165807008743,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.43333333333333335,
"step": 52
},
{
"loss": 0.00037848297506570816,
"grad_norm": 0.796875,
"learning_rate": 2.044597327993153e-07,
"num_tokens": 367280.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.23579545319080353,
"rewards/reward_format/std": 0.7612137198448181,
"rewards/reward_welfare/mean": 0.5625,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.2322249710559845,
"rewards/reward_fairness/std": 0.20398348569869995,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.19660773873329163,
"rewards/reward_composite/std": 0.17612425237894058,
"reward": 2.2271281480789185,
"reward_std": 1.4199119210243225,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3786723464727402,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.45,
"step": 54
},
{
"loss": 0.00040780752897262573,
"grad_norm": 0.65234375,
"learning_rate": 1.0502621921127776e-07,
"num_tokens": 380872.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.0625,
"rewards/reward_format/std": 0.7646470665931702,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.09930047020316124,
"rewards/reward_fairness/std": 0.18247877806425095,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.08683200180530548,
"rewards/reward_composite/std": 0.15039421617984772,
"reward": 1.4361324906349182,
"reward_std": 1.6369856595993042,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.4077882617712021,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.4666666666666667,
"step": 56
},
{
"loss": 0.00038760900497436523,
"grad_norm": 0.74609375,
"learning_rate": 3.798061746947995e-08,
"num_tokens": 394168.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.002840910106897354,
"rewards/reward_format/std": 0.8362354636192322,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.5175492167472839,
"rewards/reward_fairness/mean": 0.11489119380712509,
"rewards/reward_fairness/std": 0.17957812547683716,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.11398349702358246,
"rewards/reward_composite/std": 0.17128486931324005,
"reward": 1.6010336875915527,
"reward_std": 1.725760817527771,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3876567706465721,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.48333333333333334,
"step": 58
},
{
"loss": 0.00037222355604171753,
"grad_norm": 0.703125,
"learning_rate": 4.229604321829561e-09,
"num_tokens": 407760.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.46875,
"rewards/reward_format/std": 0.6302918791770935,
"rewards/reward_welfare/mean": 0.125,
"rewards/reward_welfare/std": 0.3535533845424652,
"rewards/reward_fairness/mean": 0.015625,
"rewards/reward_fairness/std": 0.04419417306780815,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.018206155858933926,
"rewards/reward_composite/std": 0.05149478651583195,
"reward": 0.6900811493396759,
"reward_std": 0.873493492603302,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3722131997346878,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.5,
"step": 60
},
{
"train_runtime": 1880.1016,
"train_samples_per_second": 0.255,
"train_steps_per_second": 0.032,
"total_flos": 0.0,
"train_loss": 0.0003991109939912955,
"epoch": 0.5,
"step": 60
}
],
"history": [
{
"loss": 1.8329996109008788,
"grad_norm": 2.6284756660461426,
"learning_rate": 2.6666666666666667e-05,
"epoch": 0.16666666666666666,
"step": 5
},
{
"loss": 1.641743278503418,
"grad_norm": 0.9074174761772156,
"learning_rate": 6e-05,
"epoch": 0.3333333333333333,
"step": 10
},
{
"loss": 1.3325251579284667,
"grad_norm": 0.772527277469635,
"learning_rate": 9.333333333333334e-05,
"epoch": 0.5,
"step": 15
},
{
"loss": 0.908332347869873,
"grad_norm": 0.8558230400085449,
"learning_rate": 0.00012666666666666666,
"epoch": 0.6666666666666666,
"step": 20
},
{
"loss": 0.4191232204437256,
"grad_norm": 0.6383947134017944,
"learning_rate": 0.00016,
"epoch": 0.8333333333333334,
"step": 25
},
{
"loss": 0.20252063274383544,
"grad_norm": 0.24536560475826263,
"learning_rate": 0.00019333333333333333,
"epoch": 1.0,
"step": 30
},
{
"loss": 0.1843562602996826,
"grad_norm": 0.1841956526041031,
"learning_rate": 0.0001913545457642601,
"epoch": 1.1666666666666667,
"step": 35
},
{
"loss": 0.1743373155593872,
"grad_norm": 0.12225674837827682,
"learning_rate": 0.00015877852522924732,
"epoch": 1.3333333333333333,
"step": 40
},
{
"loss": 0.1707882285118103,
"grad_norm": 0.11675203591585159,
"learning_rate": 0.00011045284632676536,
"epoch": 1.5,
"step": 45
},
{
"loss": 0.17305984497070312,
"grad_norm": 0.168966606259346,
"learning_rate": 5.9326335692419995e-05,
"epoch": 1.6666666666666665,
"step": 50
},
{
"loss": 0.1723298192024231,
"grad_norm": 0.14092567563056946,
"learning_rate": 1.9098300562505266e-05,
"epoch": 1.8333333333333335,
"step": 55
},
{
"loss": 0.16860610246658325,
"grad_norm": 0.13329552114009857,
"learning_rate": 5.478104631726711e-07,
"epoch": 2.0,
"step": 60
},
{
"train_runtime": 1079.0765,
"train_samples_per_second": 1.112,
"train_steps_per_second": 0.056,
"total_flos": 5520149869086720.0,
"train_loss": 0.6150601516167323,
"epoch": 2.0,
"step": 60
},
{
"loss": 0.00047351792454719543,
"grad_norm": 0.72265625,
"learning_rate": 8.333333333333333e-07,
"num_tokens": 13592.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.4375,
"rewards/reward_format/std": 0.3535533770918846,
"rewards/reward_welfare/mean": 0.0625,
"rewards/reward_welfare/std": 0.1767766922712326,
"rewards/reward_fairness/mean": 0.03318497911095619,
"rewards/reward_fairness/std": 0.09386129677295685,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.02344914712011814,
"rewards/reward_composite/std": 0.06632420420646667,
"reward": 0.6816341280937195,
"reward_std": 0.48826825618743896,
"frac_reward_zero_std": 0.5,
"completion_length": 400.0,
"kl": 0.47351907938718796,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.016666666666666666,
"step": 2
},
{
"loss": 0.00041250139474868774,
"grad_norm": 0.68359375,
"learning_rate": 2.5e-06,
"num_tokens": 26592.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.33522727340459824,
"rewards/reward_format/std": 0.3089452385902405,
"rewards/reward_welfare/mean": 0.125,
"rewards/reward_welfare/std": 0.2314550280570984,
"rewards/reward_fairness/mean": 0.037382133305072784,
"rewards/reward_fairness/std": 0.07023922353982925,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.03359023481607437,
"rewards/reward_composite/std": 0.06231452897191048,
"reward": 0.8607450723648071,
"reward_std": 0.4173068106174469,
"frac_reward_zero_std": 0.75,
"completion_length": 400.0,
"kl": 0.4125128909945488,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.03333333333333333,
"step": 4
},
{
"loss": 0.0003954425919800997,
"grad_norm": 0.00665283203125,
"learning_rate": 4.166666666666667e-06,
"num_tokens": 39888.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.53125,
"rewards/reward_format/std": 0.0883883461356163,
"rewards/reward_welfare/mean": 0.0,
"rewards/reward_welfare/std": 0.0,
"rewards/reward_fairness/mean": 0.0,
"rewards/reward_fairness/std": 0.0,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.0,
"rewards/reward_composite/std": 0.0,
"reward": 0.46875,
"reward_std": 0.0625,
"frac_reward_zero_std": 0.75,
"completion_length": 400.0,
"kl": 0.3954422175884247,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.05,
"step": 6
},
{
"loss": 0.00040989843546412885,
"grad_norm": 0.013671875,
"learning_rate": 4.995770395678171e-06,
"num_tokens": 53776.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.5,
"rewards/reward_format/std": 0.0,
"rewards/reward_welfare/mean": 0.0,
"rewards/reward_welfare/std": 0.0,
"rewards/reward_fairness/mean": 0.0,
"rewards/reward_fairness/std": 0.0,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.0,
"rewards/reward_composite/std": 0.0,
"reward": 0.5,
"reward_std": 0.0,
"frac_reward_zero_std": 1.0,
"completion_length": 400.0,
"kl": 0.40989840030670166,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.06666666666666667,
"step": 8
},
{
"loss": 0.00042488425970077515,
"grad_norm": 0.4921875,
"learning_rate": 4.962019382530521e-06,
"num_tokens": 67664.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.46875,
"rewards/reward_format/std": 0.2651650384068489,
"rewards/reward_welfare/mean": 0.0625,
"rewards/reward_welfare/std": 0.1767766922712326,
"rewards/reward_fairness/mean": 0.02351469174027443,
"rewards/reward_fairness/std": 0.06650959700345993,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.023048987612128258,
"rewards/reward_composite/std": 0.06519238650798798,
"reward": 0.6403136849403381,
"reward_std": 0.40562736988067627,
"frac_reward_zero_std": 0.5,
"completion_length": 400.0,
"kl": 0.42487896233797073,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.08333333333333333,
"step": 10
},
{
"loss": 0.0003492364485282451,
"grad_norm": 0.703125,
"learning_rate": 4.894973780788722e-06,
"num_tokens": 81552.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.34659090638160706,
"rewards/reward_format/std": 0.6987431943416595,
"rewards/reward_welfare/mean": 0.1875,
"rewards/reward_welfare/std": 0.408231720328331,
"rewards/reward_fairness/mean": 0.060186946764588356,
"rewards/reward_fairness/std": 0.13657810539007187,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.05276940576732159,
"rewards/reward_composite/std": 0.11823124438524246,
"reward": 0.9538654386997223,
"reward_std": 1.2448847889900208,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.34924405813217163,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.1,
"step": 12
},
{
"loss": 0.00039254588773474097,
"grad_norm": 0.7890625,
"learning_rate": 4.7955402672006855e-06,
"num_tokens": 95440.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.2755681872367859,
"rewards/reward_format/std": 0.5705045461654663,
"rewards/reward_welfare/mean": 0.1875,
"rewards/reward_welfare/std": 0.408231720328331,
"rewards/reward_fairness/mean": 0.05876787751913071,
"rewards/reward_fairness/std": 0.13999952003359795,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.051001692190766335,
"rewards/reward_composite/std": 0.12243235111236572,
"reward": 1.0217013657093048,
"reward_std": 1.1684027314186096,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3925560265779495,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.11666666666666667,
"step": 14
},
{
"loss": 0.00040383817395195365,
"grad_norm": 0.86328125,
"learning_rate": 4.665063509461098e-06,
"num_tokens": 108736.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.16477272659540176,
"rewards/reward_format/std": 0.6252594292163849,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.10286042466759682,
"rewards/reward_fairness/std": 0.16851608455181122,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.09322065114974976,
"rewards/reward_composite/std": 0.1486019790172577,
"reward": 1.343808352947235,
"reward_std": 1.4776567816734314,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.40385157614946365,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.13333333333333333,
"step": 16
},
{
"loss": 0.0004179440438747406,
"grad_norm": 0.65234375,
"learning_rate": 4.50530798188761e-06,
"num_tokens": 122624.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.34375,
"rewards/reward_format/std": 0.5564062297344208,
"rewards/reward_welfare/mean": 0.125,
"rewards/reward_welfare/std": 0.3535533845424652,
"rewards/reward_fairness/mean": 0.04079132154583931,
"rewards/reward_fairness/std": 0.11537527851760387,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.035708064679056406,
"rewards/reward_composite/std": 0.10099766962230206,
"reward": 0.8577493727207184,
"reward_std": 0.8404987752437592,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.41793932020664215,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.15,
"step": 18
},
{
"loss": 0.00039753690361976624,
"grad_norm": 0.75,
"learning_rate": 4.318434103932622e-06,
"num_tokens": 136512.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.35795454680919647,
"rewards/reward_format/std": 0.6613655686378479,
"rewards/reward_welfare/mean": 0.1875,
"rewards/reward_welfare/std": 0.408231720328331,
"rewards/reward_fairness/mean": 0.05811220221221447,
"rewards/reward_fairness/std": 0.1433359570801258,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.04960842803120613,
"rewards/reward_composite/std": 0.11969681829214096,
"reward": 0.9372660517692566,
"reward_std": 0.9138101935386658,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.39755555987358093,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.16666666666666666,
"step": 20
},
{
"loss": 0.0003871597582474351,
"grad_norm": 0.005157470703125,
"learning_rate": 4.106969024216348e-06,
"num_tokens": 150104.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.21875,
"rewards/reward_format/std": 0.38816189765930176,
"rewards/reward_welfare/mean": 0.1875,
"rewards/reward_welfare/std": 0.25877460837364197,
"rewards/reward_fairness/mean": 0.09772966802120209,
"rewards/reward_fairness/std": 0.1411271095275879,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.07592316716909409,
"rewards/reward_composite/std": 0.10568810254335403,
"reward": 1.1424028873443604,
"reward_std": 0.9167249202728271,
"frac_reward_zero_std": 0.5,
"completion_length": 400.0,
"kl": 0.3871647119522095,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.18333333333333332,
"step": 22
},
{
"loss": 0.00046034157276153564,
"grad_norm": 0.828125,
"learning_rate": 3.8737724451770155e-06,
"num_tokens": 163992.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.053977273404598236,
"rewards/reward_format/std": 0.6971071362495422,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.11436978727579117,
"rewards/reward_fairness/std": 0.21275469660758972,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.08883418142795563,
"rewards/reward_composite/std": 0.15858761221170425,
"reward": 1.4617266654968262,
"reward_std": 1.6014615297317505,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.46036188304424286,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.2,
"step": 24
},
{
"loss": 0.00037954188883304596,
"grad_norm": 0.75390625,
"learning_rate": 3.621997950501156e-06,
"num_tokens": 177584.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.11079545319080353,
"rewards/reward_format/std": 0.7605703175067902,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.13740837946534157,
"rewards/reward_fairness/std": 0.23384775966405869,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.1134110763669014,
"rewards/reward_composite/std": 0.1934959888458252,
"reward": 1.4525240659713745,
"reward_std": 1.4633366465568542,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3795487657189369,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.21666666666666667,
"step": 26
},
{
"loss": 0.00034568458795547485,
"grad_norm": 0.67578125,
"learning_rate": 3.3550503583141726e-06,
"num_tokens": 190880.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.014204561710357666,
"rewards/reward_format/std": 0.45640653371810913,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.2314550280570984,
"rewards/reward_fairness/mean": 0.17191734910011292,
"rewards/reward_fairness/std": 0.165505051612854,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.12545911967754364,
"rewards/reward_composite/std": 0.09392639249563217,
"reward": 1.6581718921661377,
"reward_std": 0.9023097902536392,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.3456726223230362,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.23333333333333334,
"step": 28
},
{
"loss": 0.0004424452781677246,
"grad_norm": 0.79296875,
"learning_rate": 3.0765396768561005e-06,
"num_tokens": 204472.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.05681818723678589,
"rewards/reward_format/std": 0.8016891181468964,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.49871626496315,
"rewards/reward_fairness/mean": 0.11990131065249443,
"rewards/reward_fairness/std": 0.19920051097869873,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.10109234228730202,
"rewards/reward_composite/std": 0.15720761567354202,
"reward": 1.5391755104064941,
"reward_std": 1.311523675918579,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.44244876503944397,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.25,
"step": 30
},
{
"loss": 0.00044108927249908447,
"grad_norm": 0.79296875,
"learning_rate": 2.7902322853130758e-06,
"num_tokens": 218360.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.23011364042758942,
"rewards/reward_format/std": 0.7126934230327606,
"rewards/reward_welfare/mean": 0.25,
"rewards/reward_welfare/std": 0.4355513006448746,
"rewards/reward_fairness/mean": 0.09009831957519054,
"rewards/reward_fairness/std": 0.17519650608301163,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.07756261341273785,
"rewards/reward_composite/std": 0.1447325348854065,
"reward": 1.1875473260879517,
"reward_std": 1.3125466108322144,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.4411006420850754,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.26666666666666666,
"step": 32
},
{
"loss": 0.00043725594878196716,
"grad_norm": 0.765625,
"learning_rate": 2.5e-06,
"num_tokens": 232248.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.3238636404275894,
"rewards/reward_format/std": 0.7694187164306641,
"rewards/reward_welfare/mean": 0.5625,
"rewards/reward_welfare/std": 0.5260358452796936,
"rewards/reward_fairness/mean": 0.19406583905220032,
"rewards/reward_fairness/std": 0.20889797061681747,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.1601117104291916,
"rewards/reward_composite/std": 0.16504594683647156,
"reward": 2.2405412197113037,
"reward_std": 1.72732412815094,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.43723437190055847,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.2833333333333333,
"step": 34
},
{
"loss": 0.0003281831741333008,
"grad_norm": 0.66015625,
"learning_rate": 2.2097677146869242e-06,
"num_tokens": 245840.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.0625,
"rewards/reward_format/std": 0.8398386240005493,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.5175492167472839,
"rewards/reward_fairness/mean": 0.208244688808918,
"rewards/reward_fairness/std": 0.3558191955089569,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.12120818346738815,
"rewards/reward_composite/std": 0.17605619877576828,
"reward": 1.6419528722763062,
"reward_std": 1.9059234857559204,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.32817772775888443,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.3,
"step": 36
},
{
"loss": 0.0003703221445903182,
"grad_norm": 0.65625,
"learning_rate": 1.9234603231439e-06,
"num_tokens": 258840.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.34375,
"rewards/reward_format/std": 0.4355708882212639,
"rewards/reward_welfare/mean": 0.125,
"rewards/reward_welfare/std": 0.2314550280570984,
"rewards/reward_fairness/mean": 0.032129574567079544,
"rewards/reward_fairness/std": 0.06755802780389786,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.0333767905831337,
"rewards/reward_composite/std": 0.07086637616157532,
"reward": 0.8467563986778259,
"reward_std": 0.8185127973556519,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.3703107312321663,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.31666666666666665,
"step": 38
},
{
"loss": 0.0003897678107023239,
"grad_norm": 0.76171875,
"learning_rate": 1.6449496416858285e-06,
"num_tokens": 272136.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.07102272659540176,
"rewards/reward_format/std": 0.8448813557624817,
"rewards/reward_welfare/mean": 0.4375,
"rewards/reward_welfare/std": 0.5260358452796936,
"rewards/reward_fairness/mean": 0.13828522339463234,
"rewards/reward_fairness/std": 0.19835777580738068,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.11603889241814613,
"rewards/reward_composite/std": 0.15580761432647705,
"reward": 1.762846827507019,
"reward_std": 1.750555157661438,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.38978311419487,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.3333333333333333,
"step": 40
},
{
"loss": 0.00036280229687690735,
"grad_norm": 0.52734375,
"learning_rate": 1.3780020494988447e-06,
"num_tokens": 286024.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.21875,
"rewards/reward_format/std": 0.7038120031356812,
"rewards/reward_welfare/mean": 0.25,
"rewards/reward_welfare/std": 0.4629100561141968,
"rewards/reward_fairness/mean": 0.08049380034208298,
"rewards/reward_fairness/std": 0.14985806494951248,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.06840312853455544,
"rewards/reward_composite/std": 0.12792598456144333,
"reward": 1.1801469326019287,
"reward_std": 1.2528201341629028,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.36278442293405533,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.35,
"step": 42
},
{
"loss": 0.00041984766721725464,
"grad_norm": 0.734375,
"learning_rate": 1.1262275548229852e-06,
"num_tokens": 299320.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.125,
"rewards/reward_format/std": 0.7535476386547089,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.44403792917728424,
"rewards/reward_fairness/mean": 0.08010485023260117,
"rewards/reward_fairness/std": 0.13119615614414215,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.06627386063337326,
"rewards/reward_composite/std": 0.0994122326374054,
"reward": 1.333878755569458,
"reward_std": 1.2175767719745636,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.4198339805006981,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.36666666666666664,
"step": 44
},
{
"loss": 0.00037025846540927887,
"grad_norm": 0.74609375,
"learning_rate": 8.930309757836517e-07,
"num_tokens": 312912.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.15625,
"rewards/reward_format/std": 0.7469770908355713,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.44403792917728424,
"rewards/reward_fairness/mean": 0.0987030416727066,
"rewards/reward_fairness/std": 0.13824082165956497,
"rewards/reward_stability/mean": 0.9375,
"rewards/reward_stability/std": 0.1767766922712326,
"rewards/reward_composite/mean": 0.0686455499380827,
"rewards/reward_composite/std": 0.11963466554880142,
"reward": 1.2610985934734344,
"reward_std": 1.1380138397216797,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.3702595606446266,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.38333333333333336,
"step": 46
},
{
"loss": 0.00037697795778512955,
"grad_norm": 0.68359375,
"learning_rate": 6.815658960673782e-07,
"num_tokens": 326208.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.07954545319080353,
"rewards/reward_format/std": 0.7752929627895355,
"rewards/reward_welfare/mean": 0.4375,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.1642819568514824,
"rewards/reward_fairness/std": 0.2405308187007904,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.12362180650234222,
"rewards/reward_composite/std": 0.16839426010847092,
"reward": 1.8049492835998535,
"reward_std": 1.6678152084350586,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3769652917981148,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.4,
"step": 48
},
{
"loss": 0.00039646029472351074,
"grad_norm": 0.6953125,
"learning_rate": 4.946920181123904e-07,
"num_tokens": 339800.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.06818181276321411,
"rewards/reward_format/std": 0.7570639848709106,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.12952633947134018,
"rewards/reward_fairness/std": 0.2085839882493019,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.10791970416903496,
"rewards/reward_composite/std": 0.1719568744301796,
"reward": 1.4817642569541931,
"reward_std": 1.4108701944351196,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.39645931124687195,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.4166666666666667,
"step": 50
},
{
"loss": 0.000441722571849823,
"grad_norm": 0.7265625,
"learning_rate": 3.3493649053890325e-07,
"num_tokens": 353688.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.03125,
"rewards/reward_format/std": 0.8107390403747559,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.5175492167472839,
"rewards/reward_fairness/mean": 0.10219378396868706,
"rewards/reward_fairness/std": 0.1520508974790573,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.09825712814927101,
"rewards/reward_composite/std": 0.1404884159564972,
"reward": 1.6067009568214417,
"reward_std": 1.2484731674194336,
"frac_reward_zero_std": 0.25,
"completion_length": 400.0,
"kl": 0.44170165807008743,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.43333333333333335,
"step": 52
},
{
"loss": 0.00037848297506570816,
"grad_norm": 0.796875,
"learning_rate": 2.044597327993153e-07,
"num_tokens": 367280.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": 0.23579545319080353,
"rewards/reward_format/std": 0.7612137198448181,
"rewards/reward_welfare/mean": 0.5625,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.2322249710559845,
"rewards/reward_fairness/std": 0.20398348569869995,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.19660773873329163,
"rewards/reward_composite/std": 0.17612425237894058,
"reward": 2.2271281480789185,
"reward_std": 1.4199119210243225,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3786723464727402,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.45,
"step": 54
},
{
"loss": 0.00040780752897262573,
"grad_norm": 0.65234375,
"learning_rate": 1.0502621921127776e-07,
"num_tokens": 380872.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.0625,
"rewards/reward_format/std": 0.7646470665931702,
"rewards/reward_welfare/mean": 0.3125,
"rewards/reward_welfare/std": 0.49022963643074036,
"rewards/reward_fairness/mean": 0.09930047020316124,
"rewards/reward_fairness/std": 0.18247877806425095,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.08683200180530548,
"rewards/reward_composite/std": 0.15039421617984772,
"reward": 1.4361324906349182,
"reward_std": 1.6369856595993042,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.4077882617712021,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.4666666666666667,
"step": 56
},
{
"loss": 0.00038760900497436523,
"grad_norm": 0.74609375,
"learning_rate": 3.798061746947995e-08,
"num_tokens": 394168.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.002840910106897354,
"rewards/reward_format/std": 0.8362354636192322,
"rewards/reward_welfare/mean": 0.375,
"rewards/reward_welfare/std": 0.5175492167472839,
"rewards/reward_fairness/mean": 0.11489119380712509,
"rewards/reward_fairness/std": 0.17957812547683716,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.11398349702358246,
"rewards/reward_composite/std": 0.17128486931324005,
"reward": 1.6010336875915527,
"reward_std": 1.725760817527771,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3876567706465721,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.48333333333333334,
"step": 58
},
{
"loss": 0.00037222355604171753,
"grad_norm": 0.703125,
"learning_rate": 4.229604321829561e-09,
"num_tokens": 407760.0,
"completions/mean_length": 400.0,
"completions/min_length": 400.0,
"completions/max_length": 400.0,
"completions/clipped_ratio": 1.0,
"completions/mean_terminated_length": 0.0,
"completions/min_terminated_length": 0.0,
"completions/max_terminated_length": 0.0,
"rewards/reward_format/mean": -0.46875,
"rewards/reward_format/std": 0.6302918791770935,
"rewards/reward_welfare/mean": 0.125,
"rewards/reward_welfare/std": 0.3535533845424652,
"rewards/reward_fairness/mean": 0.015625,
"rewards/reward_fairness/std": 0.04419417306780815,
"rewards/reward_stability/mean": 1.0,
"rewards/reward_stability/std": 0.0,
"rewards/reward_composite/mean": 0.018206155858933926,
"rewards/reward_composite/std": 0.05149478651583195,
"reward": 0.6900811493396759,
"reward_std": 0.873493492603302,
"frac_reward_zero_std": 0.0,
"completion_length": 400.0,
"kl": 0.3722131997346878,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/high_max": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 0.5,
"step": 60
},
{
"train_runtime": 1880.1016,
"train_samples_per_second": 0.255,
"train_steps_per_second": 0.032,
"total_flos": 0.0,
"train_loss": 0.0003991109939912955,
"epoch": 0.5,
"step": 60
}
]
}