{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06827686267815994, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3556.0, "completions/mean_length": 2988.1640625, "completions/mean_terminated_length": 2171.648193359375, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.0006827686267815994, "frac_reward_zero_std": 0.71875, "grad_norm": 4.203740119934082, "kl": 0.0044651031494140625, "learning_rate": 0.0, "loss": 0.0223, "num_tokens": 402745.0, "reward": 0.000972747802734375, "reward_std": 0.17136572301387787, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.19950109720230103, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3405.0, "completions/mean_length": 2921.2265625, "completions/mean_terminated_length": 1983.339599609375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.0013655372535631989, "frac_reward_zero_std": 0.765625, "grad_norm": 0.20063242316246033, "kl": 0.0014462471008300781, "learning_rate": 1.360544217687075e-07, "loss": 0.0107, "num_tokens": 798226.0, "reward": 0.000766754150390625, "reward_std": 0.2209223061800003, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.16596287488937378, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3401.0, "completions/mean_length": 2887.2734375, "completions/mean_terminated_length": 1800.3800048828125, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "epoch": 0.002048305880344798, "frac_reward_zero_std": 0.703125, "grad_norm": 0.24004808068275452, "kl": 0.0014967918395996094, "learning_rate": 2.72108843537415e-07, "loss": 0.0033, "num_tokens": 1190155.0, "reward": 0.0005645751953125, "reward_std": 0.1685226559638977, "rewards/accuracy_reward/mean": 0.1190476194024086, "rewards/accuracy_reward/std": 0.32513731718063354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.16599762439727783, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3484.0, "completions/mean_length": 2974.3671875, "completions/mean_terminated_length": 1958.3125, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 0.0027310745071263977, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2399013191461563, "kl": 0.0014467239379882812, "learning_rate": 4.0816326530612243e-07, "loss": 0.0257, "num_tokens": 1590790.0, "reward": 0.00032806396484375, "reward_std": 0.15471197664737701, "rewards/accuracy_reward/mean": 0.15079365670681, "rewards/accuracy_reward/std": 0.3592761754989624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.15666775405406952, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2984.2265625, "completions/mean_terminated_length": 1950.574462890625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.003413843133907997, "frac_reward_zero_std": 0.75, "grad_norm": 0.19607630372047424, "kl": 0.0014853477478027344, "learning_rate": 5.4421768707483e-07, "loss": 0.0151, "num_tokens": 1992267.0, "reward": 0.00040435791015625, "reward_std": 0.09119363874197006, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.15034358203411102, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 3025.21875, "completions/mean_terminated_length": 2153.52001953125, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.004096611760689596, "frac_reward_zero_std": 0.6875, "grad_norm": 0.43831193447113037, "kl": 0.0015444755554199219, "learning_rate": 6.802721088435376e-07, "loss": 0.0183, "num_tokens": 2399145.0, "reward": 0.000579833984375, "reward_std": 0.16569578647613525, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.15977807343006134, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3453.0, "completions/mean_length": 2941.140625, "completions/mean_terminated_length": 1970.549072265625, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "epoch": 0.0047793803874711955, "frac_reward_zero_std": 0.734375, "grad_norm": 0.19637395441532135, "kl": 0.0014371871948242188, "learning_rate": 8.163265306122449e-07, "loss": 0.0095, "num_tokens": 2796025.0, "reward": 0.000213623046875, "reward_std": 0.24299243092536926, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.15681491792201996, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3488.0, "completions/mean_length": 2891.265625, "completions/mean_terminated_length": 1774.4080810546875, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "epoch": 0.0054621490142527955, "frac_reward_zero_std": 0.75, "grad_norm": 0.31975099444389343, "kl": 0.0015692710876464844, "learning_rate": 9.523809523809525e-07, "loss": 0.0022, "num_tokens": 3186427.0, "reward": 0.00048828125, "reward_std": 0.16845792531967163, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.1900155246257782, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3512.0, "completions/mean_length": 3039.7265625, "completions/mean_terminated_length": 2035.844482421875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.006144917641034395, "frac_reward_zero_std": 0.84375, "grad_norm": 0.16746875643730164, "kl": 0.0014495849609375, "learning_rate": 1.08843537414966e-06, "loss": -0.003, "num_tokens": 3596628.0, "reward": -0.000152587890625, "reward_std": 0.11882579326629639, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.384765625, "rewards/tag_count_reward/std": 0.20787248015403748, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3557.0, "completions/mean_length": 2994.0859375, "completions/mean_terminated_length": 1827.9766845703125, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 0.006827686267815994, "frac_reward_zero_std": 0.765625, "grad_norm": 0.1904914826154709, "kl": 0.0014672279357910156, "learning_rate": 1.2244897959183673e-06, "loss": 0.0086, "num_tokens": 4000471.0, "reward": -0.0001983642578125, "reward_std": 0.1381067931652069, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.1854381412267685, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 2969.9296875, "completions/mean_terminated_length": 2042.803955078125, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "epoch": 0.007510454894597594, "frac_reward_zero_std": 0.8125, "grad_norm": 0.1689022183418274, "kl": 0.0014200210571289062, "learning_rate": 1.3605442176870751e-06, "loss": 0.0083, "num_tokens": 4401436.0, "reward": -0.00022125244140625, "reward_std": 0.12142608314752579, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.18276500701904297, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 3060.8046875, "completions/mean_terminated_length": 2026.5814208984375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.008193223521379193, "frac_reward_zero_std": 0.8125, "grad_norm": 0.17076706886291504, "kl": 0.0014834403991699219, "learning_rate": 1.4965986394557825e-06, "loss": -0.0041, "num_tokens": 4813407.0, "reward": 0.00043487548828125, "reward_std": 0.08559384196996689, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.18822652101516724, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3553.0, "completions/mean_length": 2834.46875, "completions/mean_terminated_length": 1870.7857666015625, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.008875992148160792, "frac_reward_zero_std": 0.65625, "grad_norm": 0.27014032006263733, "kl": 0.0014667510986328125, "learning_rate": 1.6326530612244897e-06, "loss": -0.0018, "num_tokens": 5196953.0, "reward": 0.0006103515625, "reward_std": 0.15209010243415833, "rewards/accuracy_reward/mean": 0.1269841343164444, "rewards/accuracy_reward/std": 0.33428436517715454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.423828125, "rewards/tag_count_reward/std": 0.23594047129154205, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3376.0, "completions/mean_length": 2807.625, "completions/mean_terminated_length": 1809.4287109375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.009558760774942391, "frac_reward_zero_std": 0.796875, "grad_norm": 0.19507473707199097, "kl": 0.0016207695007324219, "learning_rate": 1.7687074829931975e-06, "loss": 0.0108, "num_tokens": 5577405.0, "reward": 0.000377655029296875, "reward_std": 0.13256633281707764, "rewards/accuracy_reward/mean": 0.0476190485060215, "rewards/accuracy_reward/std": 0.21380899846553802, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4140625, "rewards/tag_count_reward/std": 0.22058774530887604, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 2901.5234375, "completions/mean_terminated_length": 2077.8447265625, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 0.01024152940172399, "frac_reward_zero_std": 0.625, "grad_norm": 0.267034649848938, "kl": 0.0017647743225097656, "learning_rate": 1.904761904761905e-06, "loss": 0.0057, "num_tokens": 5969484.0, "reward": 0.0006866455078125, "reward_std": 0.19339267909526825, "rewards/accuracy_reward/mean": 0.15079365670681, "rewards/accuracy_reward/std": 0.3592761754989624, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.24097897112369537, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3358.0, "completions/mean_length": 2898.7734375, "completions/mean_terminated_length": 1929.11328125, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 0.010924298028505591, "frac_reward_zero_std": 0.828125, "grad_norm": 67.3929672241211, "kl": 0.023897171020507812, "learning_rate": 2.0408163265306125e-06, "loss": -0.0042, "num_tokens": 6361333.0, "reward": 8.392333984375e-05, "reward_std": 0.09658843278884888, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.451171875, "rewards/tag_count_reward/std": 0.2736070454120636, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3461.0, "completions/mean_length": 3003.4921875, "completions/mean_terminated_length": 1968.6739501953125, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.01160706665528719, "frac_reward_zero_std": 0.703125, "grad_norm": 0.6091484427452087, "kl": 0.0023894309997558594, "learning_rate": 2.17687074829932e-06, "loss": 0.0014, "num_tokens": 6766810.0, "reward": 0.0002899169921875, "reward_std": 0.11323677003383636, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.458984375, "rewards/tag_count_reward/std": 0.2712927460670471, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 2798.65625, "completions/mean_terminated_length": 1820.4210205078125, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.01228983528206879, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2499501258134842, "kl": 0.002429962158203125, "learning_rate": 2.3129251700680273e-06, "loss": 0.0134, "num_tokens": 7147032.0, "reward": 0.000579833984375, "reward_std": 0.163030743598938, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.486328125, "rewards/tag_count_reward/std": 0.2846267819404602, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3527.0, "completions/mean_length": 3064.9375, "completions/mean_terminated_length": 2002.09521484375, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 0.012972603908850388, "frac_reward_zero_std": 0.734375, "grad_norm": 0.23864814639091492, "kl": 0.0025987625122070312, "learning_rate": 2.4489795918367347e-06, "loss": 0.0124, "num_tokens": 7560082.0, "reward": 0.000732421875, "reward_std": 0.24862462282180786, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.45703125, "rewards/tag_count_reward/std": 0.307564377784729, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3482.0, "completions/mean_length": 2959.5625, "completions/mean_terminated_length": 2016.784423828125, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "epoch": 0.013655372535631987, "frac_reward_zero_std": 0.765625, "grad_norm": 0.19558818638324738, "kl": 0.0030012130737304688, "learning_rate": 2.5850340136054425e-06, "loss": -0.0023, "num_tokens": 7959998.0, "reward": 0.000225067138671875, "reward_std": 0.13248002529144287, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.50390625, "rewards/tag_count_reward/std": 0.3244989812374115, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3420.0, "completions/mean_length": 3039.046875, "completions/mean_terminated_length": 2099.872314453125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.014338141162413586, "frac_reward_zero_std": 0.65625, "grad_norm": 0.22955650091171265, "kl": 0.0032596588134765625, "learning_rate": 2.7210884353741503e-06, "loss": 0.01, "num_tokens": 8369804.0, "reward": 0.000701904296875, "reward_std": 0.17948487401008606, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.521484375, "rewards/tag_count_reward/std": 0.32151728868484497, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 3000.8515625, "completions/mean_terminated_length": 2226.8544921875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.015020909789195187, "frac_reward_zero_std": 0.65625, "grad_norm": 0.23734480142593384, "kl": 0.0031490325927734375, "learning_rate": 2.8571428571428573e-06, "loss": 0.0013, "num_tokens": 8773169.0, "reward": 0.00072479248046875, "reward_std": 0.23179283738136292, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5546875, "rewards/tag_count_reward/std": 0.33340510725975037, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3471.0, "completions/mean_length": 2958.671875, "completions/mean_terminated_length": 2154.6787109375, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.015703678415976786, "frac_reward_zero_std": 0.609375, "grad_norm": 0.26028987765312195, "kl": 0.0034856796264648438, "learning_rate": 2.993197278911565e-06, "loss": 0.0139, "num_tokens": 9172685.0, "reward": -6.4849853515625e-05, "reward_std": 0.22091153264045715, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.55078125, "rewards/tag_count_reward/std": 0.3250671923160553, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3452.0, "completions/mean_length": 2942.109375, "completions/mean_terminated_length": 2090.1455078125, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "epoch": 0.016386447042758386, "frac_reward_zero_std": 0.71875, "grad_norm": 1.3826512098312378, "kl": 0.0051860809326171875, "learning_rate": 3.1292517006802725e-06, "loss": 0.0167, "num_tokens": 9570121.0, "reward": 0.00078582763671875, "reward_std": 0.17945250868797302, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.544921875, "rewards/tag_count_reward/std": 0.3356061577796936, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 2775.3515625, "completions/mean_terminated_length": 1914.5322265625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.017069215669539985, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23183774948120117, "kl": 0.0037317276000976562, "learning_rate": 3.2653061224489794e-06, "loss": 0.0208, "num_tokens": 9944328.0, "reward": 0.000370025634765625, "reward_std": 0.254111111164093, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.578125, "rewards/tag_count_reward/std": 0.3490002751350403, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3471.0, "completions/mean_length": 2773.4296875, "completions/mean_terminated_length": 1937.1270751953125, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.017751984296321584, "frac_reward_zero_std": 0.71875, "grad_norm": 0.22315989434719086, "kl": 0.00357818603515625, "learning_rate": 3.4013605442176872e-06, "loss": 0.0017, "num_tokens": 10320301.0, "reward": 0.0007476806640625, "reward_std": 0.16857659816741943, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.595703125, "rewards/tag_count_reward/std": 0.34527334570884705, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3439.0, "completions/mean_length": 2754.8046875, "completions/mean_terminated_length": 1925.609375, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.018434752923103183, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22715488076210022, "kl": 0.004376411437988281, "learning_rate": 3.537414965986395e-06, "loss": 0.0152, "num_tokens": 10693842.0, "reward": -0.000133514404296875, "reward_std": 0.22646816074848175, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.3628273606300354, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3515.0, "completions/mean_length": 2750.296875, "completions/mean_terminated_length": 1942.2462158203125, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "epoch": 0.019117521549884782, "frac_reward_zero_std": 0.703125, "grad_norm": 0.23173126578330994, "kl": 0.0041484832763671875, "learning_rate": 3.6734693877551024e-06, "loss": 0.0063, "num_tokens": 11065918.0, "reward": -3.814697265625e-05, "reward_std": 0.19604690372943878, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.634765625, "rewards/tag_count_reward/std": 0.35341742634773254, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3507.0, "completions/mean_length": 3037.46875, "completions/mean_terminated_length": 2212.313720703125, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.01980029017666638, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23083865642547607, "kl": 0.0041332244873046875, "learning_rate": 3.80952380952381e-06, "loss": 0.0131, "num_tokens": 11477276.0, "reward": 0.000438690185546875, "reward_std": 0.21265748143196106, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.548828125, "rewards/tag_count_reward/std": 0.3466069996356964, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3507.0, "completions/mean_length": 2811.78125, "completions/mean_terminated_length": 1936.60009765625, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.02048305880344798, "frac_reward_zero_std": 0.71875, "grad_norm": 0.40242594480514526, "kl": 0.005270957946777344, "learning_rate": 3.945578231292517e-06, "loss": 0.0115, "num_tokens": 11857446.0, "reward": 0.000370025634765625, "reward_std": 0.19882522523403168, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.35231152176856995, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 2682.8984375, "completions/mean_terminated_length": 1912.391357421875, "completions/min_length": 591.0, "completions/min_terminated_length": 591.0, "epoch": 0.02116582743022958, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23005975782871246, "kl": 0.005336761474609375, "learning_rate": 4.081632653061225e-06, "loss": 0.0147, "num_tokens": 12220643.0, "reward": 0.002414703369140625, "reward_std": 0.24584628641605377, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.662109375, "rewards/tag_count_reward/std": 0.35714009404182434, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3530.0, "completions/mean_length": 2564.125, "completions/mean_terminated_length": 1819.8919677734375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.021848596057011182, "frac_reward_zero_std": 0.703125, "grad_norm": 0.25696730613708496, "kl": 0.0056324005126953125, "learning_rate": 4.217687074829933e-06, "loss": 0.0124, "num_tokens": 12570379.0, "reward": 0.001094818115234375, "reward_std": 0.30667799711227417, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.662109375, "rewards/tag_count_reward/std": 0.3529820144176483, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3516.0, "completions/mean_length": 2699.609375, "completions/mean_terminated_length": 1758.1612548828125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.02253136468379278, "frac_reward_zero_std": 0.703125, "grad_norm": 0.2062237411737442, "kl": 0.0054187774658203125, "learning_rate": 4.35374149659864e-06, "loss": 0.0009, "num_tokens": 12936893.0, "reward": 0.001766204833984375, "reward_std": 0.2625269889831543, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.607421875, "rewards/tag_count_reward/std": 0.35588914155960083, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3474.0, "completions/mean_length": 3011.5625, "completions/mean_terminated_length": 2174.923095703125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.02321413331057438, "frac_reward_zero_std": 0.71875, "grad_norm": 0.21290989220142365, "kl": 0.005505561828613281, "learning_rate": 4.489795918367348e-06, "loss": 0.0036, "num_tokens": 13341873.0, "reward": 0.00034332275390625, "reward_std": 0.2514190971851349, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.556640625, "rewards/tag_count_reward/std": 0.3552403748035431, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 2893.8125, "completions/mean_terminated_length": 2111.60009765625, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.02389690193735598, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2518894374370575, "kl": 0.006389617919921875, "learning_rate": 4.6258503401360546e-06, "loss": 0.0054, "num_tokens": 13733943.0, "reward": 0.001949310302734375, "reward_std": 0.2430625557899475, "rewards/accuracy_reward/mean": 0.1111111119389534, "rewards/accuracy_reward/std": 0.31552425026893616, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.3561321198940277, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3568.0, "completions/mean_length": 2758.9765625, "completions/mean_terminated_length": 1907.761962890625, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.02457967056413758, "frac_reward_zero_std": 0.8125, "grad_norm": 0.24950897693634033, "kl": 0.0064525604248046875, "learning_rate": 4.761904761904762e-06, "loss": 0.0016, "num_tokens": 14108654.0, "reward": 0.000835418701171875, "reward_std": 0.14640939235687256, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.615234375, "rewards/tag_count_reward/std": 0.36707818508148193, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2627.984375, "completions/mean_terminated_length": 1757.58203125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.025262439190919177, "frac_reward_zero_std": 0.71875, "grad_norm": 0.6002601385116577, "kl": 0.0085296630859375, "learning_rate": 4.897959183673469e-06, "loss": 0.016, "num_tokens": 14466344.0, "reward": 0.000873565673828125, "reward_std": 0.18240346014499664, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6484375, "rewards/tag_count_reward/std": 0.3637586832046509, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3496.0, "completions/mean_length": 2844.9296875, "completions/mean_terminated_length": 1980.59326171875, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.025945207817700777, "frac_reward_zero_std": 0.78125, "grad_norm": 0.19975198805332184, "kl": 0.0067138671875, "learning_rate": 5.034013605442177e-06, "loss": 0.0107, "num_tokens": 14850485.0, "reward": 0.00069427490234375, "reward_std": 0.1932631880044937, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.609375, "rewards/tag_count_reward/std": 0.36954694986343384, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3550.0, "completions/mean_length": 2892.5703125, "completions/mean_terminated_length": 1813.93994140625, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.026627976444482376, "frac_reward_zero_std": 0.78125, "grad_norm": 0.4015129506587982, "kl": 0.007198333740234375, "learning_rate": 5.170068027210885e-06, "loss": 0.0069, "num_tokens": 15241846.0, "reward": 0.000873565673828125, "reward_std": 0.21831122040748596, "rewards/accuracy_reward/mean": 0.1269841343164444, "rewards/accuracy_reward/std": 0.33428436517715454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.556640625, "rewards/tag_count_reward/std": 0.3621007800102234, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 2893.78125, "completions/mean_terminated_length": 2006.357177734375, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "epoch": 0.027310745071263975, "frac_reward_zero_std": 0.8125, "grad_norm": 0.1919034868478775, "kl": 0.0071659088134765625, "learning_rate": 5.306122448979593e-06, "loss": -0.0002, "num_tokens": 15634130.0, "reward": 0.001003265380859375, "reward_std": 0.1768360435962677, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5859375, "rewards/tag_count_reward/std": 0.371787428855896, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 2886.0703125, "completions/mean_terminated_length": 2043.7413330078125, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 0.027993513698045574, "frac_reward_zero_std": 0.75, "grad_norm": 0.19289831817150116, "kl": 0.0070095062255859375, "learning_rate": 5.442176870748301e-06, "loss": 0.0223, "num_tokens": 16023733.0, "reward": 0.00014495849609375, "reward_std": 0.24029502272605896, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.603515625, "rewards/tag_count_reward/std": 0.36657509207725525, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3400.0, "completions/mean_length": 2989.8046875, "completions/mean_terminated_length": 2031.8162841796875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "epoch": 0.028676282324827173, "frac_reward_zero_std": 0.78125, "grad_norm": 0.1830112785100937, "kl": 0.0068645477294921875, "learning_rate": 5.578231292517007e-06, "loss": 0.0054, "num_tokens": 16426306.0, "reward": -0.0001373291015625, "reward_std": 0.14635004103183746, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.552734375, "rewards/tag_count_reward/std": 0.3640492260456085, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3478.0, "completions/mean_length": 3038.5390625, "completions/mean_terminated_length": 2098.4892578125, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.029359050951608772, "frac_reward_zero_std": 0.734375, "grad_norm": 0.23315423727035522, "kl": 0.0076847076416015625, "learning_rate": 5.7142857142857145e-06, "loss": 0.0059, "num_tokens": 16836653.0, "reward": 0.000499725341796875, "reward_std": 0.20444661378860474, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.546875, "rewards/tag_count_reward/std": 0.35873520374298096, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3565.0, "completions/mean_length": 2935.390625, "completions/mean_terminated_length": 1923.5599365234375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.030041819578390375, "frac_reward_zero_std": 0.828125, "grad_norm": 77.0829086303711, "kl": 0.3493499755859375, "learning_rate": 5.850340136054422e-06, "loss": 0.0057, "num_tokens": 17234357.0, "reward": 0.002758026123046875, "reward_std": 0.14082036912441254, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.548828125, "rewards/tag_count_reward/std": 0.3659452497959137, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3415.0, "completions/mean_length": 2836.5625, "completions/mean_terminated_length": 1962.440673828125, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 0.030724588205171974, "frac_reward_zero_std": 0.703125, "grad_norm": 0.22213716804981232, "kl": 0.00948333740234375, "learning_rate": 5.98639455782313e-06, "loss": 0.0095, "num_tokens": 17617305.0, "reward": 0.00104522705078125, "reward_std": 0.26501399278640747, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.36718815565109253, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3550.0, "completions/mean_length": 2927.9765625, "completions/mean_terminated_length": 2229.62890625, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 0.03140735683195357, "frac_reward_zero_std": 0.734375, "grad_norm": 0.24749751389026642, "kl": 0.0098724365234375, "learning_rate": 6.122448979591837e-06, "loss": 0.0106, "num_tokens": 18013336.0, "reward": 0.000614166259765625, "reward_std": 0.2623651623725891, "rewards/accuracy_reward/mean": 0.14516128599643707, "rewards/accuracy_reward/std": 0.3536924421787262, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.634765625, "rewards/tag_count_reward/std": 0.3630339503288269, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3571.0, "completions/mean_length": 2876.0234375, "completions/mean_terminated_length": 1841.28857421875, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.03209012545873517, "frac_reward_zero_std": 0.859375, "grad_norm": 0.31160029768943787, "kl": 0.0100860595703125, "learning_rate": 6.258503401360545e-06, "loss": 0.0057, "num_tokens": 18401097.0, "reward": 0.000629425048828125, "reward_std": 0.10492339730262756, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.36854684352874756, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2954.65625, "completions/mean_terminated_length": 2064.075439453125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.03277289408551677, "frac_reward_zero_std": 0.765625, "grad_norm": 0.22360989451408386, "kl": 0.010662078857421875, "learning_rate": 6.394557823129253e-06, "loss": -0.0058, "num_tokens": 18799499.0, "reward": 0.00084686279296875, "reward_std": 0.18236568570137024, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5859375, "rewards/tag_count_reward/std": 0.36510905623435974, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 2898.7734375, "completions/mean_terminated_length": 1959.75927734375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "epoch": 0.03345566271229837, "frac_reward_zero_std": 0.75, "grad_norm": 0.20342294871807098, "kl": 0.01105499267578125, "learning_rate": 6.530612244897959e-06, "loss": 0.021, "num_tokens": 19191440.0, "reward": 0.0019683837890625, "reward_std": 0.2374681532382965, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.578125, "rewards/tag_count_reward/std": 0.36954694986343384, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 2807.671875, "completions/mean_terminated_length": 1954.9835205078125, "completions/min_length": 583.0, "completions/min_terminated_length": 583.0, "epoch": 0.03413843133907997, "frac_reward_zero_std": 0.671875, "grad_norm": 0.30983856320381165, "kl": 0.0111083984375, "learning_rate": 6.666666666666667e-06, "loss": 0.0296, "num_tokens": 19571354.0, "reward": 0.0010528564453125, "reward_std": 0.29831066727638245, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.37118715047836304, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3543.0, "completions/mean_length": 2973.1796875, "completions/mean_terminated_length": 2212.333251953125, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 0.03482119996586157, "frac_reward_zero_std": 0.765625, "grad_norm": 0.18846933543682098, "kl": 0.011409759521484375, "learning_rate": 6.8027210884353745e-06, "loss": 0.0091, "num_tokens": 19971757.0, "reward": -0.000263214111328125, "reward_std": 0.19611704349517822, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.599609375, "rewards/tag_count_reward/std": 0.37033164501190186, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3437.0, "completions/mean_length": 2552.0703125, "completions/mean_terminated_length": 1846.01318359375, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.03550396859264317, "frac_reward_zero_std": 0.703125, "grad_norm": 0.2487255185842514, "kl": 0.012561798095703125, "learning_rate": 6.938775510204082e-06, "loss": 0.0039, "num_tokens": 20319594.0, "reward": 0.001209259033203125, "reward_std": 0.3313322365283966, "rewards/accuracy_reward/mean": 0.24193547666072845, "rewards/accuracy_reward/std": 0.42999276518821716, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.68359375, "rewards/tag_count_reward/std": 0.36517223715782166, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3526.0, "completions/mean_length": 2863.078125, "completions/mean_terminated_length": 1965.0877685546875, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.03618673721942477, "frac_reward_zero_std": 0.84375, "grad_norm": 0.2549895942211151, "kl": 0.0140380859375, "learning_rate": 7.07482993197279e-06, "loss": 0.0133, "num_tokens": 20707316.0, "reward": 0.00030517578125, "reward_std": 0.12158792465925217, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59375, "rewards/tag_count_reward/std": 0.36987972259521484, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3500.0, "completions/mean_length": 2757.4296875, "completions/mean_terminated_length": 1877.5322265625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "epoch": 0.036869505846206366, "frac_reward_zero_std": 0.78125, "grad_norm": 6405.5068359375, "kl": 1.8256759643554688, "learning_rate": 7.210884353741497e-06, "loss": 0.0764, "num_tokens": 21081449.0, "reward": 0.000980377197265625, "reward_std": 0.2044573724269867, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62890625, "rewards/tag_count_reward/std": 0.37118715047836304, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.3125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3478.0, "completions/mean_length": 2622.609375, "completions/mean_terminated_length": 1921.0540771484375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.037552274472987965, "frac_reward_zero_std": 0.734375, "grad_norm": 0.22383847832679749, "kl": 0.014461517333984375, "learning_rate": 7.346938775510205e-06, "loss": 0.0192, "num_tokens": 21436759.0, "reward": 0.00052642822265625, "reward_std": 0.22101403772830963, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6875, "rewards/tag_count_reward/std": 0.3672090768814087, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2823.9375, "completions/mean_terminated_length": 1989.1146240234375, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.038235043099769564, "frac_reward_zero_std": 0.84375, "grad_norm": 0.27166351675987244, "kl": 0.0138397216796875, "learning_rate": 7.482993197278913e-06, "loss": 0.0093, "num_tokens": 21818477.0, "reward": 0.0005950927734375, "reward_std": 0.14924165606498718, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.3746512532234192, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3429.0, "completions/mean_length": 2562.4921875, "completions/mean_terminated_length": 1661.1617431640625, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.03891781172655116, "frac_reward_zero_std": 0.765625, "grad_norm": 0.24227580428123474, "kl": 0.016559600830078125, "learning_rate": 7.61904761904762e-06, "loss": -0.0003, "num_tokens": 22166894.0, "reward": 0.000736236572265625, "reward_std": 0.1824897825717926, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.37416577339172363, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.36553001403808594, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 2906.65625, "completions/mean_terminated_length": 1948.1510009765625, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.03960058035333276, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2006242722272873, "kl": 0.015140533447265625, "learning_rate": 7.755102040816327e-06, "loss": 0.0129, "num_tokens": 22557526.0, "reward": 0.00145721435546875, "reward_std": 0.2016143500804901, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.583984375, "rewards/tag_count_reward/std": 0.3662393093109131, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3534.0, "completions/mean_length": 2927.015625, "completions/mean_terminated_length": 1997.32080078125, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.04028334898011436, "frac_reward_zero_std": 0.6875, "grad_norm": 0.25060155987739563, "kl": 0.016963958740234375, "learning_rate": 7.891156462585034e-06, "loss": 0.0058, "num_tokens": 22952552.0, "reward": 0.000652313232421875, "reward_std": 0.20165210962295532, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22826264798641205, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.591796875, "rewards/tag_count_reward/std": 0.3643448054790497, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3311.0, "completions/mean_length": 2959.1796875, "completions/mean_terminated_length": 2102.944580078125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.04096611760689596, "frac_reward_zero_std": 0.75, "grad_norm": 0.21332313120365143, "kl": 0.018299102783203125, "learning_rate": 8.027210884353741e-06, "loss": 0.0175, "num_tokens": 23352139.0, "reward": 0.000980377197265625, "reward_std": 0.27883005142211914, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.583984375, "rewards/tag_count_reward/std": 0.3662393093109131, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3335.0, "completions/mean_length": 3028.578125, "completions/mean_terminated_length": 2071.361572265625, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "epoch": 0.04164888623367756, "frac_reward_zero_std": 0.765625, "grad_norm": 2.1824116706848145, "kl": 0.04326629638671875, "learning_rate": 8.16326530612245e-06, "loss": 0.019, "num_tokens": 23759867.0, "reward": -0.00037384033203125, "reward_std": 0.19039314985275269, "rewards/accuracy_reward/mean": 0.1269841343164444, "rewards/accuracy_reward/std": 0.33428436517715454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.552734375, "rewards/tag_count_reward/std": 0.35997095704078674, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3384.0, "completions/mean_length": 2674.40625, "completions/mean_terminated_length": 1792.800048828125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.04233165486045916, "frac_reward_zero_std": 0.875, "grad_norm": 1.7527108192443848, "kl": 0.02565765380859375, "learning_rate": 8.299319727891157e-06, "loss": 0.0077, "num_tokens": 24122297.0, "reward": 0.000232696533203125, "reward_std": 0.12444177269935608, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.638671875, "rewards/tag_count_reward/std": 0.3735978901386261, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3267.0, "completions/mean_length": 2867.2265625, "completions/mean_terminated_length": 1672.604248046875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.043014423487240765, "frac_reward_zero_std": 0.859375, "grad_norm": 0.24444209039211273, "kl": 0.0247039794921875, "learning_rate": 8.435374149659866e-06, "loss": 0.0173, "num_tokens": 24511502.0, "reward": 0.00092315673828125, "reward_std": 0.1712632179260254, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.54296875, "rewards/tag_count_reward/std": 0.3633144795894623, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 2773.3515625, "completions/mean_terminated_length": 1882.9671630859375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.043697192114022364, "frac_reward_zero_std": 0.90625, "grad_norm": 1.2711641788482666, "kl": 0.0254058837890625, "learning_rate": 8.571428571428571e-06, "loss": 0.0043, "num_tokens": 24886755.0, "reward": 0.000362396240234375, "reward_std": 0.04139427840709686, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.615234375, "rewards/tag_count_reward/std": 0.3737213611602783, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3488.0, "completions/mean_length": 2682.109375, "completions/mean_terminated_length": 1886.323486328125, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.04437996074080396, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2437221258878708, "kl": 0.02806854248046875, "learning_rate": 8.70748299319728e-06, "loss": 0.0146, "num_tokens": 25251419.0, "reward": 0.00191497802734375, "reward_std": 0.2734946012496948, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.662109375, "rewards/tag_count_reward/std": 0.36799874901771545, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3432.0, "completions/mean_length": 2955.7578125, "completions/mean_terminated_length": 2007.2353515625, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.04506272936758556, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2303735911846161, "kl": 0.029571533203125, "learning_rate": 8.843537414965987e-06, "loss": 0.0096, "num_tokens": 25651890.0, "reward": 0.001682281494140625, "reward_std": 0.28999727964401245, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.568359375, "rewards/tag_count_reward/std": 0.3654826879501343, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3481.0, "completions/mean_length": 2760.3359375, "completions/mean_terminated_length": 2056.04345703125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.04574549799436716, "frac_reward_zero_std": 0.734375, "grad_norm": 0.20994165539741516, "kl": 0.03081512451171875, "learning_rate": 8.979591836734695e-06, "loss": 0.0144, "num_tokens": 26025045.0, "reward": 0.00035858154296875, "reward_std": 0.24585166573524475, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.662109375, "rewards/tag_count_reward/std": 0.37198901176452637, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3504.0, "completions/mean_length": 2776.21875, "completions/mean_terminated_length": 1801.310302734375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.04642826662114876, "frac_reward_zero_std": 0.84375, "grad_norm": 0.20171038806438446, "kl": 0.03635406494140625, "learning_rate": 9.115646258503402e-06, "loss": 0.0056, "num_tokens": 26400905.0, "reward": 0.000988006591796875, "reward_std": 0.18240347504615784, "rewards/accuracy_reward/mean": 0.1984127014875412, "rewards/accuracy_reward/std": 0.4003966450691223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59375, "rewards/tag_count_reward/std": 0.37384992837905884, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 2936.4921875, "completions/mean_terminated_length": 2179.2373046875, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "epoch": 0.04711103524793036, "frac_reward_zero_std": 0.75, "grad_norm": 1001.0767211914062, "kl": 17.23780059814453, "learning_rate": 9.251700680272109e-06, "loss": 0.7116, "num_tokens": 26797726.0, "reward": 0.00084686279296875, "reward_std": 0.251278817653656, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.611328125, "rewards/tag_count_reward/std": 0.3709540367126465, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3470.0, "completions/mean_length": 2773.796875, "completions/mean_terminated_length": 1764.5965576171875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.04779380387471196, "frac_reward_zero_std": 0.796875, "grad_norm": 0.41836145520210266, "kl": 0.03790283203125, "learning_rate": 9.387755102040818e-06, "loss": 0.0148, "num_tokens": 27173154.0, "reward": 0.000598907470703125, "reward_std": 0.1408850997686386, "rewards/accuracy_reward/mean": 0.0873015895485878, "rewards/accuracy_reward/std": 0.2834033668041229, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.591796875, "rewards/tag_count_reward/std": 0.3723609149456024, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 2720.921875, "completions/mean_terminated_length": 1802.1612548828125, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.04847657250149356, "frac_reward_zero_std": 0.765625, "grad_norm": 0.26940470933914185, "kl": 0.04000091552734375, "learning_rate": 9.523809523809525e-06, "loss": 0.0068, "num_tokens": 27542808.0, "reward": 0.000911712646484375, "reward_std": 0.265267550945282, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.625, "rewards/tag_count_reward/std": 0.3725312352180481, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3437.0, "completions/mean_length": 2874.6796875, "completions/mean_terminated_length": 2045.1356201171875, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.04915934112827516, "frac_reward_zero_std": 0.671875, "grad_norm": 0.24734152853488922, "kl": 0.0461883544921875, "learning_rate": 9.659863945578232e-06, "loss": 0.0259, "num_tokens": 27931119.0, "reward": 0.002407073974609375, "reward_std": 0.34816399216651917, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.609375, "rewards/tag_count_reward/std": 0.370876282453537, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3322.0, "completions/mean_length": 2482.6953125, "completions/mean_terminated_length": 1598.54931640625, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "epoch": 0.049842109755056756, "frac_reward_zero_std": 0.8125, "grad_norm": 0.27174755930900574, "kl": 0.0435333251953125, "learning_rate": 9.795918367346939e-06, "loss": 0.0159, "num_tokens": 28268244.0, "reward": 0.000179290771484375, "reward_std": 0.20714402198791504, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.666015625, "rewards/tag_count_reward/std": 0.37421485781669617, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3325.0, "completions/mean_length": 2721.0078125, "completions/mean_terminated_length": 1773.131103515625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.050524878381838355, "frac_reward_zero_std": 0.828125, "grad_norm": 0.19484789669513702, "kl": 0.049285888671875, "learning_rate": 9.931972789115647e-06, "loss": 0.0167, "num_tokens": 28637547.0, "reward": -0.00023651123046875, "reward_std": 0.20150645077228546, "rewards/accuracy_reward/mean": 0.1349206417798996, "rewards/accuracy_reward/std": 0.343002587556839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.611328125, "rewards/tag_count_reward/std": 0.3749128580093384, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3457.0, "completions/mean_length": 2996.8203125, "completions/mean_terminated_length": 1750.8536376953125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.051207647008619954, "frac_reward_zero_std": 0.90625, "grad_norm": 0.1501264125108719, "kl": 0.050567626953125, "learning_rate": 1.0068027210884354e-05, "loss": 0.0085, "num_tokens": 29041578.0, "reward": 0.000247955322265625, "reward_std": 0.06354530900716782, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.490234375, "rewards/tag_count_reward/std": 0.3513225018978119, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3479.0, "completions/mean_length": 2996.1953125, "completions/mean_terminated_length": 1983.170166015625, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 0.05189041563540155, "frac_reward_zero_std": 0.828125, "grad_norm": 0.1916869878768921, "kl": 0.060302734375, "learning_rate": 1.0204081632653063e-05, "loss": 0.0079, "num_tokens": 29446965.0, "reward": -0.0009765625, "reward_std": 0.1851494163274765, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.525390625, "rewards/tag_count_reward/std": 0.3629492223262787, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3377.0, "completions/mean_length": 2955.5390625, "completions/mean_terminated_length": 2066.20751953125, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.05257318426218315, "frac_reward_zero_std": 0.78125, "grad_norm": 0.20960082113742828, "kl": 0.0486602783203125, "learning_rate": 1.034013605442177e-05, "loss": 0.0292, "num_tokens": 29845144.0, "reward": 6.4849853515625e-05, "reward_std": 0.24845737218856812, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5859375, "rewards/tag_count_reward/std": 0.3677949607372284, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 3039.0546875, "completions/mean_terminated_length": 2099.8935546875, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.05325595288896475, "frac_reward_zero_std": 0.796875, "grad_norm": 0.2618347704410553, "kl": 0.051513671875, "learning_rate": 1.0476190476190477e-05, "loss": -0.0031, "num_tokens": 30252963.0, "reward": 0.001010894775390625, "reward_std": 0.14087432622909546, "rewards/accuracy_reward/mean": 0.1587301641702652, "rewards/accuracy_reward/std": 0.3668830394744873, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.529296875, "rewards/tag_count_reward/std": 0.3626524806022644, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3557.0, "completions/mean_length": 2955.546875, "completions/mean_terminated_length": 1975.159912109375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.05393872151574635, "frac_reward_zero_std": 0.796875, "grad_norm": 0.17245353758335114, "kl": 0.049591064453125, "learning_rate": 1.0612244897959186e-05, "loss": 0.0149, "num_tokens": 30651341.0, "reward": 0.0001983642578125, "reward_std": 0.21557608246803284, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.54296875, "rewards/tag_count_reward/std": 0.3673556447029114, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 2876.4921875, "completions/mean_terminated_length": 1875.3018798828125, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.05462149014252795, "frac_reward_zero_std": 0.796875, "grad_norm": 0.6020998358726501, "kl": 0.051025390625, "learning_rate": 1.0748299319727893e-05, "loss": 0.0126, "num_tokens": 31040454.0, "reward": 0.000537872314453125, "reward_std": 0.25402480363845825, "rewards/accuracy_reward/mean": 0.2380952388048172, "rewards/accuracy_reward/std": 0.42761799693107605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5703125, "rewards/tag_count_reward/std": 0.3684633672237396, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2868.4609375, "completions/mean_terminated_length": 2082.540771484375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.05530425876930955, "frac_reward_zero_std": 0.84375, "grad_norm": 0.5201172232627869, "kl": 0.0519256591796875, "learning_rate": 1.0884353741496601e-05, "loss": 0.0058, "num_tokens": 31428703.0, "reward": 4.9591064453125e-05, "reward_std": 0.11865855008363724, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.615234375, "rewards/tag_count_reward/std": 0.3737213611602783, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3527.0, "completions/mean_length": 2931.8828125, "completions/mean_terminated_length": 2038.24072265625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.05598702739609115, "frac_reward_zero_std": 0.8125, "grad_norm": 0.16862784326076508, "kl": 0.0520172119140625, "learning_rate": 1.1020408163265306e-05, "loss": 0.0193, "num_tokens": 31824572.0, "reward": -2.288818359375e-05, "reward_std": 0.18232254683971405, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.36869287490844727, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2902.2890625, "completions/mean_terminated_length": 2129.683349609375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.05666979602287275, "frac_reward_zero_std": 0.625, "grad_norm": 0.23633578419685364, "kl": 0.0478515625, "learning_rate": 1.1156462585034013e-05, "loss": 0.033, "num_tokens": 32216469.0, "reward": 0.00135040283203125, "reward_std": 0.3315965533256531, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.3723454177379608, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 3001.6484375, "completions/mean_terminated_length": 2228.708984375, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.057352564649654346, "frac_reward_zero_std": 0.703125, "grad_norm": 0.325137197971344, "kl": 0.05816650390625, "learning_rate": 1.1292517006802722e-05, "loss": 0.0124, "num_tokens": 32621572.0, "reward": 0.001155853271484375, "reward_std": 0.2237599790096283, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.591796875, "rewards/tag_count_reward/std": 0.36837467551231384, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 2994.1875, "completions/mean_terminated_length": 2074.079833984375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.058035333276435945, "frac_reward_zero_std": 0.765625, "grad_norm": 0.19226306676864624, "kl": 0.0503692626953125, "learning_rate": 1.1428571428571429e-05, "loss": 0.0206, "num_tokens": 33025138.0, "reward": 0.001537322998046875, "reward_std": 0.19341963529586792, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5546875, "rewards/tag_count_reward/std": 0.36578238010406494, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3518.0, "completions/mean_length": 2806.734375, "completions/mean_terminated_length": 2029.46875, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "epoch": 0.058718101903217544, "frac_reward_zero_std": 0.796875, "grad_norm": 0.1984071582555771, "kl": 0.0474853515625, "learning_rate": 1.1564625850340136e-05, "loss": 0.0158, "num_tokens": 33406072.0, "reward": 0.002166748046875, "reward_std": 0.18222543597221375, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.63671875, "rewards/tag_count_reward/std": 0.3723454177379608, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3557.0, "completions/mean_length": 2901.484375, "completions/mean_terminated_length": 2023.96435546875, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "epoch": 0.05940087052999914, "frac_reward_zero_std": 0.734375, "grad_norm": 0.305919349193573, "kl": 0.0582427978515625, "learning_rate": 1.1700680272108845e-05, "loss": 0.0068, "num_tokens": 33798208.0, "reward": 0.000659942626953125, "reward_std": 0.22651132941246033, "rewards/accuracy_reward/mean": 0.190476194024086, "rewards/accuracy_reward/std": 0.3942442834377289, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.3677949607372284, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3454.0, "completions/mean_length": 3092.203125, "completions/mean_terminated_length": 2153.318359375, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "epoch": 0.06008363915678075, "frac_reward_zero_std": 0.828125, "grad_norm": 0.1946588158607483, "kl": 0.055511474609375, "learning_rate": 1.1836734693877552e-05, "loss": 0.0017, "num_tokens": 34214982.0, "reward": -0.000263214111328125, "reward_std": 0.11038293689489365, "rewards/accuracy_reward/mean": 0.1269841343164444, "rewards/accuracy_reward/std": 0.33428436517715454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.3563264012336731, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 2994.59375, "completions/mean_terminated_length": 1829.4884033203125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.06076640778356235, "frac_reward_zero_std": 0.828125, "grad_norm": 0.21530690789222717, "kl": 0.055999755859375, "learning_rate": 1.197278911564626e-05, "loss": 0.013, "num_tokens": 34619146.0, "reward": 0.001010894775390625, "reward_std": 0.1959228217601776, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.513671875, "rewards/tag_count_reward/std": 0.3553701937198639, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3486.0, "completions/mean_length": 2994.4296875, "completions/mean_terminated_length": 1943.45654296875, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.06144917641034395, "frac_reward_zero_std": 0.78125, "grad_norm": 0.8442981839179993, "kl": 0.062225341796875, "learning_rate": 1.2108843537414967e-05, "loss": 0.0002, "num_tokens": 35024113.0, "reward": 0.000217437744140625, "reward_std": 0.14361488819122314, "rewards/accuracy_reward/mean": 0.1190476194024086, "rewards/accuracy_reward/std": 0.32513731718063354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.55078125, "rewards/tag_count_reward/std": 0.35819894075393677, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3410.0, "completions/mean_length": 2968.9921875, "completions/mean_terminated_length": 1753.279052734375, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 0.06213194503712555, "frac_reward_zero_std": 0.890625, "grad_norm": 0.16523298621177673, "kl": 0.05694580078125, "learning_rate": 1.2244897959183674e-05, "loss": -0.0001, "num_tokens": 35425398.0, "reward": 0.000301361083984375, "reward_std": 0.08008575439453125, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.513671875, "rewards/tag_count_reward/std": 0.3553701937198639, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3495.0, "completions/mean_length": 3055.75, "completions/mean_terminated_length": 2047.2728271484375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.06281471366390715, "frac_reward_zero_std": 0.875, "grad_norm": 0.2896662652492523, "kl": 0.061492919921875, "learning_rate": 1.2380952380952383e-05, "loss": 0.008, "num_tokens": 35836856.0, "reward": 0.000789642333984375, "reward_std": 0.13813376426696777, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.515625, "rewards/tag_count_reward/std": 0.35736072063446045, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3408.0, "completions/mean_length": 2961.109375, "completions/mean_terminated_length": 1922.9583740234375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.06349748229068874, "frac_reward_zero_std": 0.78125, "grad_norm": 0.42472153902053833, "kl": 0.072906494140625, "learning_rate": 1.251700680272109e-05, "loss": 0.0155, "num_tokens": 36237138.0, "reward": 0.002079010009765625, "reward_std": 0.1656903773546219, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.37416577339172363, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.546875, "rewards/tag_count_reward/std": 0.3628273606300354, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 2843.9609375, "completions/mean_terminated_length": 2126.6923828125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.06418025091747034, "frac_reward_zero_std": 0.796875, "grad_norm": 0.1708218902349472, "kl": 0.0665435791015625, "learning_rate": 1.2653061224489798e-05, "loss": 0.0062, "num_tokens": 36621545.0, "reward": 0.000835418701171875, "reward_std": 0.17405231297016144, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.634765625, "rewards/tag_count_reward/std": 0.3750358819961548, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 2785.140625, "completions/mean_terminated_length": 1907.704833984375, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "epoch": 0.06486301954425194, "frac_reward_zero_std": 0.75, "grad_norm": 0.20617185533046722, "kl": 0.06744384765625, "learning_rate": 1.2789115646258505e-05, "loss": 0.0182, "num_tokens": 36997949.0, "reward": 0.001251220703125, "reward_std": 0.23479235172271729, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.619140625, "rewards/tag_count_reward/std": 0.36983296275138855, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 2894.3671875, "completions/mean_terminated_length": 2087.847412109375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.06554578817103354, "frac_reward_zero_std": 0.75, "grad_norm": 2.6165342330932617, "kl": 0.0714874267578125, "learning_rate": 1.2925170068027212e-05, "loss": 0.0166, "num_tokens": 37389310.0, "reward": 0.001499176025390625, "reward_std": 0.1436256766319275, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.3658454418182373, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 2910.828125, "completions/mean_terminated_length": 1926.961669921875, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "epoch": 0.06622855679781514, "frac_reward_zero_std": 0.703125, "grad_norm": 0.22562777996063232, "kl": 0.0776824951171875, "learning_rate": 1.3061224489795918e-05, "loss": 0.0179, "num_tokens": 37783408.0, "reward": 0.001220703125, "reward_std": 0.2375113070011139, "rewards/accuracy_reward/mean": 0.1190476194024086, "rewards/accuracy_reward/std": 0.32513731718063354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5625, "rewards/tag_count_reward/std": 0.36854684352874756, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 2856.5, "completions/mean_terminated_length": 2214.588134765625, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.06691132542459674, "frac_reward_zero_std": 0.71875, "grad_norm": 0.21397413313388824, "kl": 0.07177734375, "learning_rate": 1.3197278911564626e-05, "loss": 0.0013, "num_tokens": 38169006.0, "reward": 0.0004119873046875, "reward_std": 0.19331714510917664, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.3673556447029114, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 2946.0546875, "completions/mean_terminated_length": 2245.360595703125, "completions/min_length": 524.0, "completions/min_terminated_length": 524.0, "epoch": 0.06759409405137834, "frac_reward_zero_std": 0.75, "grad_norm": 0.20259307324886322, "kl": 0.0770263671875, "learning_rate": 1.3333333333333333e-05, "loss": 0.0211, "num_tokens": 38565695.0, "reward": 0.0014495849609375, "reward_std": 0.25680309534072876, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.630859375, "rewards/tag_count_reward/std": 0.36849987506866455, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2963.0859375, "completions/mean_terminated_length": 2138.963623046875, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.06827686267815994, "frac_reward_zero_std": 0.828125, "grad_norm": 0.16437853872776031, "kl": 0.0745697021484375, "learning_rate": 1.3469387755102042e-05, "loss": 0.0213, "num_tokens": 38964378.0, "reward": 0.002105712890625, "reward_std": 0.16297680139541626, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.587890625, "rewards/tag_count_reward/std": 0.3693336248397827, "step": 100 } ], "logging_steps": 1, "max_steps": 1465, "num_input_tokens_seen": 38964378, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }