{ "phase": "sft+grpo", "sft_history": [ { "loss": 1.8329996109008788, "grad_norm": 2.6284756660461426, "learning_rate": 2.6666666666666667e-05, "epoch": 0.16666666666666666, "step": 5 }, { "loss": 1.641743278503418, "grad_norm": 0.9074174761772156, "learning_rate": 6e-05, "epoch": 0.3333333333333333, "step": 10 }, { "loss": 1.3325251579284667, "grad_norm": 0.772527277469635, "learning_rate": 9.333333333333334e-05, "epoch": 0.5, "step": 15 }, { "loss": 0.908332347869873, "grad_norm": 0.8558230400085449, "learning_rate": 0.00012666666666666666, "epoch": 0.6666666666666666, "step": 20 }, { "loss": 0.4191232204437256, "grad_norm": 0.6383947134017944, "learning_rate": 0.00016, "epoch": 0.8333333333333334, "step": 25 }, { "loss": 0.20252063274383544, "grad_norm": 0.24536560475826263, "learning_rate": 0.00019333333333333333, "epoch": 1.0, "step": 30 }, { "loss": 0.1843562602996826, "grad_norm": 0.1841956526041031, "learning_rate": 0.0001913545457642601, "epoch": 1.1666666666666667, "step": 35 }, { "loss": 0.1743373155593872, "grad_norm": 0.12225674837827682, "learning_rate": 0.00015877852522924732, "epoch": 1.3333333333333333, "step": 40 }, { "loss": 0.1707882285118103, "grad_norm": 0.11675203591585159, "learning_rate": 0.00011045284632676536, "epoch": 1.5, "step": 45 }, { "loss": 0.17305984497070312, "grad_norm": 0.168966606259346, "learning_rate": 5.9326335692419995e-05, "epoch": 1.6666666666666665, "step": 50 }, { "loss": 0.1723298192024231, "grad_norm": 0.14092567563056946, "learning_rate": 1.9098300562505266e-05, "epoch": 1.8333333333333335, "step": 55 }, { "loss": 0.16860610246658325, "grad_norm": 0.13329552114009857, "learning_rate": 5.478104631726711e-07, "epoch": 2.0, "step": 60 }, { "train_runtime": 1079.0765, "train_samples_per_second": 1.112, "train_steps_per_second": 0.056, "total_flos": 5520149869086720.0, "train_loss": 0.6150601516167323, "epoch": 2.0, "step": 60 } ], "grpo_history": [ { "loss": 0.00047351792454719543, "grad_norm": 0.72265625, "learning_rate": 8.333333333333333e-07, "num_tokens": 13592.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.4375, "rewards/reward_format/std": 0.3535533770918846, "rewards/reward_welfare/mean": 0.0625, "rewards/reward_welfare/std": 0.1767766922712326, "rewards/reward_fairness/mean": 0.03318497911095619, "rewards/reward_fairness/std": 0.09386129677295685, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.02344914712011814, "rewards/reward_composite/std": 0.06632420420646667, "reward": 0.6816341280937195, "reward_std": 0.48826825618743896, "frac_reward_zero_std": 0.5, "completion_length": 400.0, "kl": 0.47351907938718796, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.016666666666666666, "step": 2 }, { "loss": 0.00041250139474868774, "grad_norm": 0.68359375, "learning_rate": 2.5e-06, "num_tokens": 26592.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.33522727340459824, "rewards/reward_format/std": 0.3089452385902405, "rewards/reward_welfare/mean": 0.125, "rewards/reward_welfare/std": 0.2314550280570984, "rewards/reward_fairness/mean": 0.037382133305072784, "rewards/reward_fairness/std": 0.07023922353982925, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.03359023481607437, "rewards/reward_composite/std": 0.06231452897191048, "reward": 0.8607450723648071, "reward_std": 0.4173068106174469, "frac_reward_zero_std": 0.75, "completion_length": 400.0, "kl": 0.4125128909945488, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03333333333333333, "step": 4 }, { "loss": 0.0003954425919800997, "grad_norm": 0.00665283203125, "learning_rate": 4.166666666666667e-06, "num_tokens": 39888.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.53125, "rewards/reward_format/std": 0.0883883461356163, "rewards/reward_welfare/mean": 0.0, "rewards/reward_welfare/std": 0.0, "rewards/reward_fairness/mean": 0.0, "rewards/reward_fairness/std": 0.0, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.0, "rewards/reward_composite/std": 0.0, "reward": 0.46875, "reward_std": 0.0625, "frac_reward_zero_std": 0.75, "completion_length": 400.0, "kl": 0.3954422175884247, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05, "step": 6 }, { "loss": 0.00040989843546412885, "grad_norm": 0.013671875, "learning_rate": 4.995770395678171e-06, "num_tokens": 53776.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.5, "rewards/reward_format/std": 0.0, "rewards/reward_welfare/mean": 0.0, "rewards/reward_welfare/std": 0.0, "rewards/reward_fairness/mean": 0.0, "rewards/reward_fairness/std": 0.0, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.0, "rewards/reward_composite/std": 0.0, "reward": 0.5, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 400.0, "kl": 0.40989840030670166, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06666666666666667, "step": 8 }, { "loss": 0.00042488425970077515, "grad_norm": 0.4921875, "learning_rate": 4.962019382530521e-06, "num_tokens": 67664.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.46875, "rewards/reward_format/std": 0.2651650384068489, "rewards/reward_welfare/mean": 0.0625, "rewards/reward_welfare/std": 0.1767766922712326, "rewards/reward_fairness/mean": 0.02351469174027443, "rewards/reward_fairness/std": 0.06650959700345993, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.023048987612128258, "rewards/reward_composite/std": 0.06519238650798798, "reward": 0.6403136849403381, "reward_std": 0.40562736988067627, "frac_reward_zero_std": 0.5, "completion_length": 400.0, "kl": 0.42487896233797073, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08333333333333333, "step": 10 }, { "loss": 0.0003492364485282451, "grad_norm": 0.703125, "learning_rate": 4.894973780788722e-06, "num_tokens": 81552.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.34659090638160706, "rewards/reward_format/std": 0.6987431943416595, "rewards/reward_welfare/mean": 0.1875, "rewards/reward_welfare/std": 0.408231720328331, "rewards/reward_fairness/mean": 0.060186946764588356, "rewards/reward_fairness/std": 0.13657810539007187, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.05276940576732159, "rewards/reward_composite/std": 0.11823124438524246, "reward": 0.9538654386997223, "reward_std": 1.2448847889900208, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.34924405813217163, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1, "step": 12 }, { "loss": 0.00039254588773474097, "grad_norm": 0.7890625, "learning_rate": 4.7955402672006855e-06, "num_tokens": 95440.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.2755681872367859, "rewards/reward_format/std": 0.5705045461654663, "rewards/reward_welfare/mean": 0.1875, "rewards/reward_welfare/std": 0.408231720328331, "rewards/reward_fairness/mean": 0.05876787751913071, "rewards/reward_fairness/std": 0.13999952003359795, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.051001692190766335, "rewards/reward_composite/std": 0.12243235111236572, "reward": 1.0217013657093048, "reward_std": 1.1684027314186096, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3925560265779495, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11666666666666667, "step": 14 }, { "loss": 0.00040383817395195365, "grad_norm": 0.86328125, "learning_rate": 4.665063509461098e-06, "num_tokens": 108736.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.16477272659540176, "rewards/reward_format/std": 0.6252594292163849, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.10286042466759682, "rewards/reward_fairness/std": 0.16851608455181122, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.09322065114974976, "rewards/reward_composite/std": 0.1486019790172577, "reward": 1.343808352947235, "reward_std": 1.4776567816734314, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.40385157614946365, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13333333333333333, "step": 16 }, { "loss": 0.0004179440438747406, "grad_norm": 0.65234375, "learning_rate": 4.50530798188761e-06, "num_tokens": 122624.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.34375, "rewards/reward_format/std": 0.5564062297344208, "rewards/reward_welfare/mean": 0.125, "rewards/reward_welfare/std": 0.3535533845424652, "rewards/reward_fairness/mean": 0.04079132154583931, "rewards/reward_fairness/std": 0.11537527851760387, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.035708064679056406, "rewards/reward_composite/std": 0.10099766962230206, "reward": 0.8577493727207184, "reward_std": 0.8404987752437592, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.41793932020664215, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15, "step": 18 }, { "loss": 0.00039753690361976624, "grad_norm": 0.75, "learning_rate": 4.318434103932622e-06, "num_tokens": 136512.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.35795454680919647, "rewards/reward_format/std": 0.6613655686378479, "rewards/reward_welfare/mean": 0.1875, "rewards/reward_welfare/std": 0.408231720328331, "rewards/reward_fairness/mean": 0.05811220221221447, "rewards/reward_fairness/std": 0.1433359570801258, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.04960842803120613, "rewards/reward_composite/std": 0.11969681829214096, "reward": 0.9372660517692566, "reward_std": 0.9138101935386658, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.39755555987358093, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16666666666666666, "step": 20 }, { "loss": 0.0003871597582474351, "grad_norm": 0.005157470703125, "learning_rate": 4.106969024216348e-06, "num_tokens": 150104.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.21875, "rewards/reward_format/std": 0.38816189765930176, "rewards/reward_welfare/mean": 0.1875, "rewards/reward_welfare/std": 0.25877460837364197, "rewards/reward_fairness/mean": 0.09772966802120209, "rewards/reward_fairness/std": 0.1411271095275879, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.07592316716909409, "rewards/reward_composite/std": 0.10568810254335403, "reward": 1.1424028873443604, "reward_std": 0.9167249202728271, "frac_reward_zero_std": 0.5, "completion_length": 400.0, "kl": 0.3871647119522095, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18333333333333332, "step": 22 }, { "loss": 0.00046034157276153564, "grad_norm": 0.828125, "learning_rate": 3.8737724451770155e-06, "num_tokens": 163992.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.053977273404598236, "rewards/reward_format/std": 0.6971071362495422, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.11436978727579117, "rewards/reward_fairness/std": 0.21275469660758972, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.08883418142795563, "rewards/reward_composite/std": 0.15858761221170425, "reward": 1.4617266654968262, "reward_std": 1.6014615297317505, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.46036188304424286, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2, "step": 24 }, { "loss": 0.00037954188883304596, "grad_norm": 0.75390625, "learning_rate": 3.621997950501156e-06, "num_tokens": 177584.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.11079545319080353, "rewards/reward_format/std": 0.7605703175067902, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.13740837946534157, "rewards/reward_fairness/std": 0.23384775966405869, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.1134110763669014, "rewards/reward_composite/std": 0.1934959888458252, "reward": 1.4525240659713745, "reward_std": 1.4633366465568542, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3795487657189369, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21666666666666667, "step": 26 }, { "loss": 0.00034568458795547485, "grad_norm": 0.67578125, "learning_rate": 3.3550503583141726e-06, "num_tokens": 190880.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.014204561710357666, "rewards/reward_format/std": 0.45640653371810913, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.2314550280570984, "rewards/reward_fairness/mean": 0.17191734910011292, "rewards/reward_fairness/std": 0.165505051612854, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.12545911967754364, "rewards/reward_composite/std": 0.09392639249563217, "reward": 1.6581718921661377, "reward_std": 0.9023097902536392, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.3456726223230362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23333333333333334, "step": 28 }, { "loss": 0.0004424452781677246, "grad_norm": 0.79296875, "learning_rate": 3.0765396768561005e-06, "num_tokens": 204472.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.05681818723678589, "rewards/reward_format/std": 0.8016891181468964, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.49871626496315, "rewards/reward_fairness/mean": 0.11990131065249443, "rewards/reward_fairness/std": 0.19920051097869873, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.10109234228730202, "rewards/reward_composite/std": 0.15720761567354202, "reward": 1.5391755104064941, "reward_std": 1.311523675918579, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.44244876503944397, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25, "step": 30 }, { "loss": 0.00044108927249908447, "grad_norm": 0.79296875, "learning_rate": 2.7902322853130758e-06, "num_tokens": 218360.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.23011364042758942, "rewards/reward_format/std": 0.7126934230327606, "rewards/reward_welfare/mean": 0.25, "rewards/reward_welfare/std": 0.4355513006448746, "rewards/reward_fairness/mean": 0.09009831957519054, "rewards/reward_fairness/std": 0.17519650608301163, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.07756261341273785, "rewards/reward_composite/std": 0.1447325348854065, "reward": 1.1875473260879517, "reward_std": 1.3125466108322144, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.4411006420850754, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26666666666666666, "step": 32 }, { "loss": 0.00043725594878196716, "grad_norm": 0.765625, "learning_rate": 2.5e-06, "num_tokens": 232248.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.3238636404275894, "rewards/reward_format/std": 0.7694187164306641, "rewards/reward_welfare/mean": 0.5625, "rewards/reward_welfare/std": 0.5260358452796936, "rewards/reward_fairness/mean": 0.19406583905220032, "rewards/reward_fairness/std": 0.20889797061681747, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.1601117104291916, "rewards/reward_composite/std": 0.16504594683647156, "reward": 2.2405412197113037, "reward_std": 1.72732412815094, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.43723437190055847, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2833333333333333, "step": 34 }, { "loss": 0.0003281831741333008, "grad_norm": 0.66015625, "learning_rate": 2.2097677146869242e-06, "num_tokens": 245840.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.0625, "rewards/reward_format/std": 0.8398386240005493, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.5175492167472839, "rewards/reward_fairness/mean": 0.208244688808918, "rewards/reward_fairness/std": 0.3558191955089569, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.12120818346738815, "rewards/reward_composite/std": 0.17605619877576828, "reward": 1.6419528722763062, "reward_std": 1.9059234857559204, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.32817772775888443, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3, "step": 36 }, { "loss": 0.0003703221445903182, "grad_norm": 0.65625, "learning_rate": 1.9234603231439e-06, "num_tokens": 258840.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.34375, "rewards/reward_format/std": 0.4355708882212639, "rewards/reward_welfare/mean": 0.125, "rewards/reward_welfare/std": 0.2314550280570984, "rewards/reward_fairness/mean": 0.032129574567079544, "rewards/reward_fairness/std": 0.06755802780389786, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.0333767905831337, "rewards/reward_composite/std": 0.07086637616157532, "reward": 0.8467563986778259, "reward_std": 0.8185127973556519, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.3703107312321663, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31666666666666665, "step": 38 }, { "loss": 0.0003897678107023239, "grad_norm": 0.76171875, "learning_rate": 1.6449496416858285e-06, "num_tokens": 272136.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.07102272659540176, "rewards/reward_format/std": 0.8448813557624817, "rewards/reward_welfare/mean": 0.4375, "rewards/reward_welfare/std": 0.5260358452796936, "rewards/reward_fairness/mean": 0.13828522339463234, "rewards/reward_fairness/std": 0.19835777580738068, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.11603889241814613, "rewards/reward_composite/std": 0.15580761432647705, "reward": 1.762846827507019, "reward_std": 1.750555157661438, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.38978311419487, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3333333333333333, "step": 40 }, { "loss": 0.00036280229687690735, "grad_norm": 0.52734375, "learning_rate": 1.3780020494988447e-06, "num_tokens": 286024.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.21875, "rewards/reward_format/std": 0.7038120031356812, "rewards/reward_welfare/mean": 0.25, "rewards/reward_welfare/std": 0.4629100561141968, "rewards/reward_fairness/mean": 0.08049380034208298, "rewards/reward_fairness/std": 0.14985806494951248, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.06840312853455544, "rewards/reward_composite/std": 0.12792598456144333, "reward": 1.1801469326019287, "reward_std": 1.2528201341629028, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.36278442293405533, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.35, "step": 42 }, { "loss": 0.00041984766721725464, "grad_norm": 0.734375, "learning_rate": 1.1262275548229852e-06, "num_tokens": 299320.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.125, "rewards/reward_format/std": 0.7535476386547089, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.44403792917728424, "rewards/reward_fairness/mean": 0.08010485023260117, "rewards/reward_fairness/std": 0.13119615614414215, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.06627386063337326, "rewards/reward_composite/std": 0.0994122326374054, "reward": 1.333878755569458, "reward_std": 1.2175767719745636, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.4198339805006981, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.36666666666666664, "step": 44 }, { "loss": 0.00037025846540927887, "grad_norm": 0.74609375, "learning_rate": 8.930309757836517e-07, "num_tokens": 312912.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.15625, "rewards/reward_format/std": 0.7469770908355713, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.44403792917728424, "rewards/reward_fairness/mean": 0.0987030416727066, "rewards/reward_fairness/std": 0.13824082165956497, "rewards/reward_stability/mean": 0.9375, "rewards/reward_stability/std": 0.1767766922712326, "rewards/reward_composite/mean": 0.0686455499380827, "rewards/reward_composite/std": 0.11963466554880142, "reward": 1.2610985934734344, "reward_std": 1.1380138397216797, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.3702595606446266, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.38333333333333336, "step": 46 }, { "loss": 0.00037697795778512955, "grad_norm": 0.68359375, "learning_rate": 6.815658960673782e-07, "num_tokens": 326208.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.07954545319080353, "rewards/reward_format/std": 0.7752929627895355, "rewards/reward_welfare/mean": 0.4375, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.1642819568514824, "rewards/reward_fairness/std": 0.2405308187007904, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.12362180650234222, "rewards/reward_composite/std": 0.16839426010847092, "reward": 1.8049492835998535, "reward_std": 1.6678152084350586, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3769652917981148, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4, "step": 48 }, { "loss": 0.00039646029472351074, "grad_norm": 0.6953125, "learning_rate": 4.946920181123904e-07, "num_tokens": 339800.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.06818181276321411, "rewards/reward_format/std": 0.7570639848709106, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.12952633947134018, "rewards/reward_fairness/std": 0.2085839882493019, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.10791970416903496, "rewards/reward_composite/std": 0.1719568744301796, "reward": 1.4817642569541931, "reward_std": 1.4108701944351196, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.39645931124687195, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4166666666666667, "step": 50 }, { "loss": 0.000441722571849823, "grad_norm": 0.7265625, "learning_rate": 3.3493649053890325e-07, "num_tokens": 353688.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.03125, "rewards/reward_format/std": 0.8107390403747559, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.5175492167472839, "rewards/reward_fairness/mean": 0.10219378396868706, "rewards/reward_fairness/std": 0.1520508974790573, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.09825712814927101, "rewards/reward_composite/std": 0.1404884159564972, "reward": 1.6067009568214417, "reward_std": 1.2484731674194336, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.44170165807008743, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.43333333333333335, "step": 52 }, { "loss": 0.00037848297506570816, "grad_norm": 0.796875, "learning_rate": 2.044597327993153e-07, "num_tokens": 367280.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.23579545319080353, "rewards/reward_format/std": 0.7612137198448181, "rewards/reward_welfare/mean": 0.5625, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.2322249710559845, "rewards/reward_fairness/std": 0.20398348569869995, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.19660773873329163, "rewards/reward_composite/std": 0.17612425237894058, "reward": 2.2271281480789185, "reward_std": 1.4199119210243225, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3786723464727402, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.45, "step": 54 }, { "loss": 0.00040780752897262573, "grad_norm": 0.65234375, "learning_rate": 1.0502621921127776e-07, "num_tokens": 380872.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.0625, "rewards/reward_format/std": 0.7646470665931702, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.09930047020316124, "rewards/reward_fairness/std": 0.18247877806425095, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.08683200180530548, "rewards/reward_composite/std": 0.15039421617984772, "reward": 1.4361324906349182, "reward_std": 1.6369856595993042, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.4077882617712021, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4666666666666667, "step": 56 }, { "loss": 0.00038760900497436523, "grad_norm": 0.74609375, "learning_rate": 3.798061746947995e-08, "num_tokens": 394168.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.002840910106897354, "rewards/reward_format/std": 0.8362354636192322, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.5175492167472839, "rewards/reward_fairness/mean": 0.11489119380712509, "rewards/reward_fairness/std": 0.17957812547683716, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.11398349702358246, "rewards/reward_composite/std": 0.17128486931324005, "reward": 1.6010336875915527, "reward_std": 1.725760817527771, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3876567706465721, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.48333333333333334, "step": 58 }, { "loss": 0.00037222355604171753, "grad_norm": 0.703125, "learning_rate": 4.229604321829561e-09, "num_tokens": 407760.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.46875, "rewards/reward_format/std": 0.6302918791770935, "rewards/reward_welfare/mean": 0.125, "rewards/reward_welfare/std": 0.3535533845424652, "rewards/reward_fairness/mean": 0.015625, "rewards/reward_fairness/std": 0.04419417306780815, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.018206155858933926, "rewards/reward_composite/std": 0.05149478651583195, "reward": 0.6900811493396759, "reward_std": 0.873493492603302, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3722131997346878, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5, "step": 60 }, { "train_runtime": 1880.1016, "train_samples_per_second": 0.255, "train_steps_per_second": 0.032, "total_flos": 0.0, "train_loss": 0.0003991109939912955, "epoch": 0.5, "step": 60 } ], "history": [ { "loss": 1.8329996109008788, "grad_norm": 2.6284756660461426, "learning_rate": 2.6666666666666667e-05, "epoch": 0.16666666666666666, "step": 5 }, { "loss": 1.641743278503418, "grad_norm": 0.9074174761772156, "learning_rate": 6e-05, "epoch": 0.3333333333333333, "step": 10 }, { "loss": 1.3325251579284667, "grad_norm": 0.772527277469635, "learning_rate": 9.333333333333334e-05, "epoch": 0.5, "step": 15 }, { "loss": 0.908332347869873, "grad_norm": 0.8558230400085449, "learning_rate": 0.00012666666666666666, "epoch": 0.6666666666666666, "step": 20 }, { "loss": 0.4191232204437256, "grad_norm": 0.6383947134017944, "learning_rate": 0.00016, "epoch": 0.8333333333333334, "step": 25 }, { "loss": 0.20252063274383544, "grad_norm": 0.24536560475826263, "learning_rate": 0.00019333333333333333, "epoch": 1.0, "step": 30 }, { "loss": 0.1843562602996826, "grad_norm": 0.1841956526041031, "learning_rate": 0.0001913545457642601, "epoch": 1.1666666666666667, "step": 35 }, { "loss": 0.1743373155593872, "grad_norm": 0.12225674837827682, "learning_rate": 0.00015877852522924732, "epoch": 1.3333333333333333, "step": 40 }, { "loss": 0.1707882285118103, "grad_norm": 0.11675203591585159, "learning_rate": 0.00011045284632676536, "epoch": 1.5, "step": 45 }, { "loss": 0.17305984497070312, "grad_norm": 0.168966606259346, "learning_rate": 5.9326335692419995e-05, "epoch": 1.6666666666666665, "step": 50 }, { "loss": 0.1723298192024231, "grad_norm": 0.14092567563056946, "learning_rate": 1.9098300562505266e-05, "epoch": 1.8333333333333335, "step": 55 }, { "loss": 0.16860610246658325, "grad_norm": 0.13329552114009857, "learning_rate": 5.478104631726711e-07, "epoch": 2.0, "step": 60 }, { "train_runtime": 1079.0765, "train_samples_per_second": 1.112, "train_steps_per_second": 0.056, "total_flos": 5520149869086720.0, "train_loss": 0.6150601516167323, "epoch": 2.0, "step": 60 }, { "loss": 0.00047351792454719543, "grad_norm": 0.72265625, "learning_rate": 8.333333333333333e-07, "num_tokens": 13592.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.4375, "rewards/reward_format/std": 0.3535533770918846, "rewards/reward_welfare/mean": 0.0625, "rewards/reward_welfare/std": 0.1767766922712326, "rewards/reward_fairness/mean": 0.03318497911095619, "rewards/reward_fairness/std": 0.09386129677295685, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.02344914712011814, "rewards/reward_composite/std": 0.06632420420646667, "reward": 0.6816341280937195, "reward_std": 0.48826825618743896, "frac_reward_zero_std": 0.5, "completion_length": 400.0, "kl": 0.47351907938718796, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.016666666666666666, "step": 2 }, { "loss": 0.00041250139474868774, "grad_norm": 0.68359375, "learning_rate": 2.5e-06, "num_tokens": 26592.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.33522727340459824, "rewards/reward_format/std": 0.3089452385902405, "rewards/reward_welfare/mean": 0.125, "rewards/reward_welfare/std": 0.2314550280570984, "rewards/reward_fairness/mean": 0.037382133305072784, "rewards/reward_fairness/std": 0.07023922353982925, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.03359023481607437, "rewards/reward_composite/std": 0.06231452897191048, "reward": 0.8607450723648071, "reward_std": 0.4173068106174469, "frac_reward_zero_std": 0.75, "completion_length": 400.0, "kl": 0.4125128909945488, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.03333333333333333, "step": 4 }, { "loss": 0.0003954425919800997, "grad_norm": 0.00665283203125, "learning_rate": 4.166666666666667e-06, "num_tokens": 39888.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.53125, "rewards/reward_format/std": 0.0883883461356163, "rewards/reward_welfare/mean": 0.0, "rewards/reward_welfare/std": 0.0, "rewards/reward_fairness/mean": 0.0, "rewards/reward_fairness/std": 0.0, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.0, "rewards/reward_composite/std": 0.0, "reward": 0.46875, "reward_std": 0.0625, "frac_reward_zero_std": 0.75, "completion_length": 400.0, "kl": 0.3954422175884247, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05, "step": 6 }, { "loss": 0.00040989843546412885, "grad_norm": 0.013671875, "learning_rate": 4.995770395678171e-06, "num_tokens": 53776.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.5, "rewards/reward_format/std": 0.0, "rewards/reward_welfare/mean": 0.0, "rewards/reward_welfare/std": 0.0, "rewards/reward_fairness/mean": 0.0, "rewards/reward_fairness/std": 0.0, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.0, "rewards/reward_composite/std": 0.0, "reward": 0.5, "reward_std": 0.0, "frac_reward_zero_std": 1.0, "completion_length": 400.0, "kl": 0.40989840030670166, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.06666666666666667, "step": 8 }, { "loss": 0.00042488425970077515, "grad_norm": 0.4921875, "learning_rate": 4.962019382530521e-06, "num_tokens": 67664.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.46875, "rewards/reward_format/std": 0.2651650384068489, "rewards/reward_welfare/mean": 0.0625, "rewards/reward_welfare/std": 0.1767766922712326, "rewards/reward_fairness/mean": 0.02351469174027443, "rewards/reward_fairness/std": 0.06650959700345993, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.023048987612128258, "rewards/reward_composite/std": 0.06519238650798798, "reward": 0.6403136849403381, "reward_std": 0.40562736988067627, "frac_reward_zero_std": 0.5, "completion_length": 400.0, "kl": 0.42487896233797073, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.08333333333333333, "step": 10 }, { "loss": 0.0003492364485282451, "grad_norm": 0.703125, "learning_rate": 4.894973780788722e-06, "num_tokens": 81552.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.34659090638160706, "rewards/reward_format/std": 0.6987431943416595, "rewards/reward_welfare/mean": 0.1875, "rewards/reward_welfare/std": 0.408231720328331, "rewards/reward_fairness/mean": 0.060186946764588356, "rewards/reward_fairness/std": 0.13657810539007187, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.05276940576732159, "rewards/reward_composite/std": 0.11823124438524246, "reward": 0.9538654386997223, "reward_std": 1.2448847889900208, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.34924405813217163, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1, "step": 12 }, { "loss": 0.00039254588773474097, "grad_norm": 0.7890625, "learning_rate": 4.7955402672006855e-06, "num_tokens": 95440.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.2755681872367859, "rewards/reward_format/std": 0.5705045461654663, "rewards/reward_welfare/mean": 0.1875, "rewards/reward_welfare/std": 0.408231720328331, "rewards/reward_fairness/mean": 0.05876787751913071, "rewards/reward_fairness/std": 0.13999952003359795, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.051001692190766335, "rewards/reward_composite/std": 0.12243235111236572, "reward": 1.0217013657093048, "reward_std": 1.1684027314186096, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3925560265779495, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.11666666666666667, "step": 14 }, { "loss": 0.00040383817395195365, "grad_norm": 0.86328125, "learning_rate": 4.665063509461098e-06, "num_tokens": 108736.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.16477272659540176, "rewards/reward_format/std": 0.6252594292163849, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.10286042466759682, "rewards/reward_fairness/std": 0.16851608455181122, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.09322065114974976, "rewards/reward_composite/std": 0.1486019790172577, "reward": 1.343808352947235, "reward_std": 1.4776567816734314, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.40385157614946365, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.13333333333333333, "step": 16 }, { "loss": 0.0004179440438747406, "grad_norm": 0.65234375, "learning_rate": 4.50530798188761e-06, "num_tokens": 122624.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.34375, "rewards/reward_format/std": 0.5564062297344208, "rewards/reward_welfare/mean": 0.125, "rewards/reward_welfare/std": 0.3535533845424652, "rewards/reward_fairness/mean": 0.04079132154583931, "rewards/reward_fairness/std": 0.11537527851760387, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.035708064679056406, "rewards/reward_composite/std": 0.10099766962230206, "reward": 0.8577493727207184, "reward_std": 0.8404987752437592, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.41793932020664215, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.15, "step": 18 }, { "loss": 0.00039753690361976624, "grad_norm": 0.75, "learning_rate": 4.318434103932622e-06, "num_tokens": 136512.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.35795454680919647, "rewards/reward_format/std": 0.6613655686378479, "rewards/reward_welfare/mean": 0.1875, "rewards/reward_welfare/std": 0.408231720328331, "rewards/reward_fairness/mean": 0.05811220221221447, "rewards/reward_fairness/std": 0.1433359570801258, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.04960842803120613, "rewards/reward_composite/std": 0.11969681829214096, "reward": 0.9372660517692566, "reward_std": 0.9138101935386658, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.39755555987358093, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.16666666666666666, "step": 20 }, { "loss": 0.0003871597582474351, "grad_norm": 0.005157470703125, "learning_rate": 4.106969024216348e-06, "num_tokens": 150104.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.21875, "rewards/reward_format/std": 0.38816189765930176, "rewards/reward_welfare/mean": 0.1875, "rewards/reward_welfare/std": 0.25877460837364197, "rewards/reward_fairness/mean": 0.09772966802120209, "rewards/reward_fairness/std": 0.1411271095275879, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.07592316716909409, "rewards/reward_composite/std": 0.10568810254335403, "reward": 1.1424028873443604, "reward_std": 0.9167249202728271, "frac_reward_zero_std": 0.5, "completion_length": 400.0, "kl": 0.3871647119522095, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.18333333333333332, "step": 22 }, { "loss": 0.00046034157276153564, "grad_norm": 0.828125, "learning_rate": 3.8737724451770155e-06, "num_tokens": 163992.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.053977273404598236, "rewards/reward_format/std": 0.6971071362495422, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.11436978727579117, "rewards/reward_fairness/std": 0.21275469660758972, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.08883418142795563, "rewards/reward_composite/std": 0.15858761221170425, "reward": 1.4617266654968262, "reward_std": 1.6014615297317505, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.46036188304424286, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2, "step": 24 }, { "loss": 0.00037954188883304596, "grad_norm": 0.75390625, "learning_rate": 3.621997950501156e-06, "num_tokens": 177584.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.11079545319080353, "rewards/reward_format/std": 0.7605703175067902, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.13740837946534157, "rewards/reward_fairness/std": 0.23384775966405869, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.1134110763669014, "rewards/reward_composite/std": 0.1934959888458252, "reward": 1.4525240659713745, "reward_std": 1.4633366465568542, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3795487657189369, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.21666666666666667, "step": 26 }, { "loss": 0.00034568458795547485, "grad_norm": 0.67578125, "learning_rate": 3.3550503583141726e-06, "num_tokens": 190880.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.014204561710357666, "rewards/reward_format/std": 0.45640653371810913, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.2314550280570984, "rewards/reward_fairness/mean": 0.17191734910011292, "rewards/reward_fairness/std": 0.165505051612854, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.12545911967754364, "rewards/reward_composite/std": 0.09392639249563217, "reward": 1.6581718921661377, "reward_std": 0.9023097902536392, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.3456726223230362, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.23333333333333334, "step": 28 }, { "loss": 0.0004424452781677246, "grad_norm": 0.79296875, "learning_rate": 3.0765396768561005e-06, "num_tokens": 204472.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.05681818723678589, "rewards/reward_format/std": 0.8016891181468964, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.49871626496315, "rewards/reward_fairness/mean": 0.11990131065249443, "rewards/reward_fairness/std": 0.19920051097869873, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.10109234228730202, "rewards/reward_composite/std": 0.15720761567354202, "reward": 1.5391755104064941, "reward_std": 1.311523675918579, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.44244876503944397, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.25, "step": 30 }, { "loss": 0.00044108927249908447, "grad_norm": 0.79296875, "learning_rate": 2.7902322853130758e-06, "num_tokens": 218360.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.23011364042758942, "rewards/reward_format/std": 0.7126934230327606, "rewards/reward_welfare/mean": 0.25, "rewards/reward_welfare/std": 0.4355513006448746, "rewards/reward_fairness/mean": 0.09009831957519054, "rewards/reward_fairness/std": 0.17519650608301163, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.07756261341273785, "rewards/reward_composite/std": 0.1447325348854065, "reward": 1.1875473260879517, "reward_std": 1.3125466108322144, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.4411006420850754, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.26666666666666666, "step": 32 }, { "loss": 0.00043725594878196716, "grad_norm": 0.765625, "learning_rate": 2.5e-06, "num_tokens": 232248.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.3238636404275894, "rewards/reward_format/std": 0.7694187164306641, "rewards/reward_welfare/mean": 0.5625, "rewards/reward_welfare/std": 0.5260358452796936, "rewards/reward_fairness/mean": 0.19406583905220032, "rewards/reward_fairness/std": 0.20889797061681747, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.1601117104291916, "rewards/reward_composite/std": 0.16504594683647156, "reward": 2.2405412197113037, "reward_std": 1.72732412815094, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.43723437190055847, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.2833333333333333, "step": 34 }, { "loss": 0.0003281831741333008, "grad_norm": 0.66015625, "learning_rate": 2.2097677146869242e-06, "num_tokens": 245840.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.0625, "rewards/reward_format/std": 0.8398386240005493, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.5175492167472839, "rewards/reward_fairness/mean": 0.208244688808918, "rewards/reward_fairness/std": 0.3558191955089569, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.12120818346738815, "rewards/reward_composite/std": 0.17605619877576828, "reward": 1.6419528722763062, "reward_std": 1.9059234857559204, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.32817772775888443, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3, "step": 36 }, { "loss": 0.0003703221445903182, "grad_norm": 0.65625, "learning_rate": 1.9234603231439e-06, "num_tokens": 258840.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.34375, "rewards/reward_format/std": 0.4355708882212639, "rewards/reward_welfare/mean": 0.125, "rewards/reward_welfare/std": 0.2314550280570984, "rewards/reward_fairness/mean": 0.032129574567079544, "rewards/reward_fairness/std": 0.06755802780389786, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.0333767905831337, "rewards/reward_composite/std": 0.07086637616157532, "reward": 0.8467563986778259, "reward_std": 0.8185127973556519, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.3703107312321663, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.31666666666666665, "step": 38 }, { "loss": 0.0003897678107023239, "grad_norm": 0.76171875, "learning_rate": 1.6449496416858285e-06, "num_tokens": 272136.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.07102272659540176, "rewards/reward_format/std": 0.8448813557624817, "rewards/reward_welfare/mean": 0.4375, "rewards/reward_welfare/std": 0.5260358452796936, "rewards/reward_fairness/mean": 0.13828522339463234, "rewards/reward_fairness/std": 0.19835777580738068, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.11603889241814613, "rewards/reward_composite/std": 0.15580761432647705, "reward": 1.762846827507019, "reward_std": 1.750555157661438, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.38978311419487, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.3333333333333333, "step": 40 }, { "loss": 0.00036280229687690735, "grad_norm": 0.52734375, "learning_rate": 1.3780020494988447e-06, "num_tokens": 286024.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.21875, "rewards/reward_format/std": 0.7038120031356812, "rewards/reward_welfare/mean": 0.25, "rewards/reward_welfare/std": 0.4629100561141968, "rewards/reward_fairness/mean": 0.08049380034208298, "rewards/reward_fairness/std": 0.14985806494951248, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.06840312853455544, "rewards/reward_composite/std": 0.12792598456144333, "reward": 1.1801469326019287, "reward_std": 1.2528201341629028, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.36278442293405533, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.35, "step": 42 }, { "loss": 0.00041984766721725464, "grad_norm": 0.734375, "learning_rate": 1.1262275548229852e-06, "num_tokens": 299320.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.125, "rewards/reward_format/std": 0.7535476386547089, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.44403792917728424, "rewards/reward_fairness/mean": 0.08010485023260117, "rewards/reward_fairness/std": 0.13119615614414215, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.06627386063337326, "rewards/reward_composite/std": 0.0994122326374054, "reward": 1.333878755569458, "reward_std": 1.2175767719745636, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.4198339805006981, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.36666666666666664, "step": 44 }, { "loss": 0.00037025846540927887, "grad_norm": 0.74609375, "learning_rate": 8.930309757836517e-07, "num_tokens": 312912.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.15625, "rewards/reward_format/std": 0.7469770908355713, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.44403792917728424, "rewards/reward_fairness/mean": 0.0987030416727066, "rewards/reward_fairness/std": 0.13824082165956497, "rewards/reward_stability/mean": 0.9375, "rewards/reward_stability/std": 0.1767766922712326, "rewards/reward_composite/mean": 0.0686455499380827, "rewards/reward_composite/std": 0.11963466554880142, "reward": 1.2610985934734344, "reward_std": 1.1380138397216797, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.3702595606446266, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.38333333333333336, "step": 46 }, { "loss": 0.00037697795778512955, "grad_norm": 0.68359375, "learning_rate": 6.815658960673782e-07, "num_tokens": 326208.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.07954545319080353, "rewards/reward_format/std": 0.7752929627895355, "rewards/reward_welfare/mean": 0.4375, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.1642819568514824, "rewards/reward_fairness/std": 0.2405308187007904, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.12362180650234222, "rewards/reward_composite/std": 0.16839426010847092, "reward": 1.8049492835998535, "reward_std": 1.6678152084350586, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3769652917981148, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4, "step": 48 }, { "loss": 0.00039646029472351074, "grad_norm": 0.6953125, "learning_rate": 4.946920181123904e-07, "num_tokens": 339800.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.06818181276321411, "rewards/reward_format/std": 0.7570639848709106, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.12952633947134018, "rewards/reward_fairness/std": 0.2085839882493019, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.10791970416903496, "rewards/reward_composite/std": 0.1719568744301796, "reward": 1.4817642569541931, "reward_std": 1.4108701944351196, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.39645931124687195, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4166666666666667, "step": 50 }, { "loss": 0.000441722571849823, "grad_norm": 0.7265625, "learning_rate": 3.3493649053890325e-07, "num_tokens": 353688.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.03125, "rewards/reward_format/std": 0.8107390403747559, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.5175492167472839, "rewards/reward_fairness/mean": 0.10219378396868706, "rewards/reward_fairness/std": 0.1520508974790573, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.09825712814927101, "rewards/reward_composite/std": 0.1404884159564972, "reward": 1.6067009568214417, "reward_std": 1.2484731674194336, "frac_reward_zero_std": 0.25, "completion_length": 400.0, "kl": 0.44170165807008743, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.43333333333333335, "step": 52 }, { "loss": 0.00037848297506570816, "grad_norm": 0.796875, "learning_rate": 2.044597327993153e-07, "num_tokens": 367280.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": 0.23579545319080353, "rewards/reward_format/std": 0.7612137198448181, "rewards/reward_welfare/mean": 0.5625, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.2322249710559845, "rewards/reward_fairness/std": 0.20398348569869995, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.19660773873329163, "rewards/reward_composite/std": 0.17612425237894058, "reward": 2.2271281480789185, "reward_std": 1.4199119210243225, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3786723464727402, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.45, "step": 54 }, { "loss": 0.00040780752897262573, "grad_norm": 0.65234375, "learning_rate": 1.0502621921127776e-07, "num_tokens": 380872.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.0625, "rewards/reward_format/std": 0.7646470665931702, "rewards/reward_welfare/mean": 0.3125, "rewards/reward_welfare/std": 0.49022963643074036, "rewards/reward_fairness/mean": 0.09930047020316124, "rewards/reward_fairness/std": 0.18247877806425095, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.08683200180530548, "rewards/reward_composite/std": 0.15039421617984772, "reward": 1.4361324906349182, "reward_std": 1.6369856595993042, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.4077882617712021, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.4666666666666667, "step": 56 }, { "loss": 0.00038760900497436523, "grad_norm": 0.74609375, "learning_rate": 3.798061746947995e-08, "num_tokens": 394168.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.002840910106897354, "rewards/reward_format/std": 0.8362354636192322, "rewards/reward_welfare/mean": 0.375, "rewards/reward_welfare/std": 0.5175492167472839, "rewards/reward_fairness/mean": 0.11489119380712509, "rewards/reward_fairness/std": 0.17957812547683716, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.11398349702358246, "rewards/reward_composite/std": 0.17128486931324005, "reward": 1.6010336875915527, "reward_std": 1.725760817527771, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3876567706465721, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.48333333333333334, "step": 58 }, { "loss": 0.00037222355604171753, "grad_norm": 0.703125, "learning_rate": 4.229604321829561e-09, "num_tokens": 407760.0, "completions/mean_length": 400.0, "completions/min_length": 400.0, "completions/max_length": 400.0, "completions/clipped_ratio": 1.0, "completions/mean_terminated_length": 0.0, "completions/min_terminated_length": 0.0, "completions/max_terminated_length": 0.0, "rewards/reward_format/mean": -0.46875, "rewards/reward_format/std": 0.6302918791770935, "rewards/reward_welfare/mean": 0.125, "rewards/reward_welfare/std": 0.3535533845424652, "rewards/reward_fairness/mean": 0.015625, "rewards/reward_fairness/std": 0.04419417306780815, "rewards/reward_stability/mean": 1.0, "rewards/reward_stability/std": 0.0, "rewards/reward_composite/mean": 0.018206155858933926, "rewards/reward_composite/std": 0.05149478651583195, "reward": 0.6900811493396759, "reward_std": 0.873493492603302, "frac_reward_zero_std": 0.0, "completion_length": 400.0, "kl": 0.3722131997346878, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.5, "step": 60 }, { "train_runtime": 1880.1016, "train_samples_per_second": 0.255, "train_steps_per_second": 0.032, "total_flos": 0.0, "train_loss": 0.0003991109939912955, "epoch": 0.5, "step": 60 } ] }