{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.034139159749930655, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3384.0, "completions/mean_length": 3031.375, "completions/mean_terminated_length": 2223.6923828125, "completions/min_length": 628.0, "completions/min_terminated_length": 628.0, "epoch": 0.00034139159749930657, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3454612195491791, "kl": 0.0014934539794921875, "learning_rate": 0.0, "loss": 0.0131, "num_tokens": 408948.0, "reward": -0.000518798828125, "reward_std": 0.5122510194778442, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.1874077022075653, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 3004.4296875, "completions/mean_terminated_length": 2100.300048828125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.0006827831949986131, "frac_reward_zero_std": 0.53125, "grad_norm": 0.29601675271987915, "kl": 0.0014243125915527344, "learning_rate": 6.825938566552902e-08, "loss": 0.0139, "num_tokens": 813103.0, "reward": 0.002353668212890625, "reward_std": 0.4794514775276184, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.388671875, "rewards/tag_count_reward/std": 0.1929071843624115, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3513.0, "completions/mean_length": 2913.28125, "completions/mean_terminated_length": 2050.9287109375, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.0010241747924979196, "frac_reward_zero_std": 0.46875, "grad_norm": 0.3300672471523285, "kl": 0.00140380859375, "learning_rate": 1.3651877133105803e-07, "loss": 0.0384, "num_tokens": 1207751.0, "reward": 0.002513885498046875, "reward_std": 0.5358012318611145, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.404296875, "rewards/tag_count_reward/std": 0.1987190544605255, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3474.0, "completions/mean_length": 3104.0078125, "completions/mean_terminated_length": 2048.025146484375, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 0.0013655663899972263, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3199569880962372, "kl": 0.0014719963073730469, "learning_rate": 2.0477815699658704e-07, "loss": -0.0044, "num_tokens": 1626444.0, "reward": 0.002269744873046875, "reward_std": 0.39746206998825073, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.12199576199054718, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3557.0, "completions/mean_length": 2822.0703125, "completions/mean_terminated_length": 1958.550048828125, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.0017069579874965327, "frac_reward_zero_std": 0.5, "grad_norm": 0.5131797194480896, "kl": 0.0017642974853515625, "learning_rate": 2.7303754266211607e-07, "loss": 0.0246, "num_tokens": 2009325.0, "reward": 0.002338409423828125, "reward_std": 0.5197770595550537, "rewards/accuracy_reward/mean": 0.1854838728904724, "rewards/accuracy_reward/std": 0.39026644825935364, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.400390625, "rewards/tag_count_reward/std": 0.17563115060329437, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 2976.859375, "completions/mean_terminated_length": 1894.5653076171875, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 0.002048349584995839, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3703017830848694, "kl": 0.0014200210571289062, "learning_rate": 3.412969283276451e-07, "loss": 0.0155, "num_tokens": 2413423.0, "reward": -3.4332275390625e-05, "reward_std": 0.32629644870758057, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.1658238172531128, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 3199.34375, "completions/mean_terminated_length": 2045.375, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.0023897411824951456, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3080924153327942, "kl": 0.0014858245849609375, "learning_rate": 4.0955631399317407e-07, "loss": 0.0213, "num_tokens": 2842315.0, "reward": 0.001033782958984375, "reward_std": 0.396534264087677, "rewards/accuracy_reward/mean": 0.08870967477560043, "rewards/accuracy_reward/std": 0.2854776978492737, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.15759754180908203, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3566.0, "completions/mean_length": 2934.2890625, "completions/mean_terminated_length": 2284.578125, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 0.0027311327799944525, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3579351305961609, "kl": 0.0013875961303710938, "learning_rate": 4.778156996587031e-07, "loss": 0.0331, "num_tokens": 3238360.0, "reward": 0.000652313232421875, "reward_std": 0.6936367750167847, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.45867621898651123, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4140625, "rewards/tag_count_reward/std": 0.18676035106182098, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3481.0, "completions/mean_length": 2918.296875, "completions/mean_terminated_length": 1913.2156982421875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.003072524377493759, "frac_reward_zero_std": 0.59375, "grad_norm": 0.35903388261795044, "kl": 0.0016965866088867188, "learning_rate": 5.460750853242321e-07, "loss": 0.0375, "num_tokens": 3631630.0, "reward": 0.00307464599609375, "reward_std": 0.33484694361686707, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.18005220592021942, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 2982.34375, "completions/mean_terminated_length": 1979.5833740234375, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 0.0034139159749930655, "frac_reward_zero_std": 0.65625, "grad_norm": 0.2800871729850769, "kl": 0.0014004707336425781, "learning_rate": 6.143344709897612e-07, "loss": 0.0061, "num_tokens": 4032634.0, "reward": 0.002002716064453125, "reward_std": 0.3920637369155884, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.16291163861751556, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3438.0, "completions/mean_length": 3002.859375, "completions/mean_terminated_length": 2001.319091796875, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.003755307572492372, "frac_reward_zero_std": 0.53125, "grad_norm": 0.34766384959220886, "kl": 0.001445770263671875, "learning_rate": 6.825938566552902e-07, "loss": 0.002, "num_tokens": 4437176.0, "reward": 0.00319671630859375, "reward_std": 0.4675132632255554, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.1774279773235321, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3534.0, "completions/mean_length": 3045.2109375, "completions/mean_terminated_length": 2306.870361328125, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "epoch": 0.004096699169991678, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3157983422279358, "kl": 0.0015764236450195312, "learning_rate": 7.508532423208192e-07, "loss": 0.0143, "num_tokens": 4846087.0, "reward": 0.002231597900390625, "reward_std": 0.3572506606578827, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.13304568827152252, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 2860.609375, "completions/mean_terminated_length": 1768.431396484375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.004438090767490985, "frac_reward_zero_std": 0.53125, "grad_norm": 0.37694165110588074, "kl": 0.0015270709991455078, "learning_rate": 8.191126279863481e-07, "loss": 0.0346, "num_tokens": 5232477.0, "reward": 0.001003265380859375, "reward_std": 0.6925603747367859, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.388671875, "rewards/tag_count_reward/std": 0.17969954013824463, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 2896.859375, "completions/mean_terminated_length": 2093.254150390625, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 0.004779482364990291, "frac_reward_zero_std": 0.5, "grad_norm": 0.35431718826293945, "kl": 0.0013761520385742188, "learning_rate": 8.873720136518772e-07, "loss": 0.0177, "num_tokens": 5623871.0, "reward": -0.001251220703125, "reward_std": 0.4679982364177704, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.419921875, "rewards/tag_count_reward/std": 0.21946066617965698, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3461.0, "completions/mean_length": 2785.8828125, "completions/mean_terminated_length": 1852.4915771484375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.005120873962489598, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4343598186969757, "kl": 0.0015163421630859375, "learning_rate": 9.556313993174062e-07, "loss": -0.0054, "num_tokens": 6001032.0, "reward": 0.002971649169921875, "reward_std": 0.6950701475143433, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.458984375, "rewards/tag_count_reward/std": 0.24658562242984772, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 2952.3359375, "completions/mean_terminated_length": 1899.5625, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.005462265559988905, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3521086275577545, "kl": 0.0014247894287109375, "learning_rate": 1.0238907849829352e-06, "loss": -0.0006, "num_tokens": 6399003.0, "reward": 0.00257110595703125, "reward_std": 0.628842294216156, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.365234375, "rewards/tag_count_reward/std": 0.15966975688934326, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3453.0, "completions/mean_length": 2899.1484375, "completions/mean_terminated_length": 1830.7799072265625, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "epoch": 0.005803657157488211, "frac_reward_zero_std": 0.59375, "grad_norm": 0.364692360162735, "kl": 0.0017580986022949219, "learning_rate": 1.0921501706484643e-06, "loss": -0.0019, "num_tokens": 6790306.0, "reward": 0.00452423095703125, "reward_std": 0.5271071791648865, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.408203125, "rewards/tag_count_reward/std": 0.2323940247297287, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3566.0, "completions/mean_length": 3142.2578125, "completions/mean_terminated_length": 2170.425048828125, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.006145048754987518, "frac_reward_zero_std": 0.5625, "grad_norm": 2.2038419246673584, "kl": 0.0022678375244140625, "learning_rate": 1.1604095563139933e-06, "loss": 0.0156, "num_tokens": 7214535.0, "reward": 0.003131866455078125, "reward_std": 0.46297091245651245, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.16484715044498444, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.1875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3568.0, "completions/mean_length": 3149.6015625, "completions/mean_terminated_length": 2120.76318359375, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.006486440352486824, "frac_reward_zero_std": 0.625, "grad_norm": 0.2853949964046478, "kl": 0.0016880035400390625, "learning_rate": 1.2286689419795223e-06, "loss": 0.0308, "num_tokens": 7638300.0, "reward": 0.002002716064453125, "reward_std": 0.4671037197113037, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.20557557046413422, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 2792.9765625, "completions/mean_terminated_length": 1838.2930908203125, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.006827831949986131, "frac_reward_zero_std": 0.375, "grad_norm": 0.45470118522644043, "kl": 0.0015316009521484375, "learning_rate": 1.2969283276450511e-06, "loss": 0.0087, "num_tokens": 8016385.0, "reward": 0.00186920166015625, "reward_std": 0.5223162174224854, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4296875, "rewards/tag_count_reward/std": 0.22609640657901764, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 2861.4140625, "completions/mean_terminated_length": 1902.3453369140625, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 0.007169223547485438, "frac_reward_zero_std": 0.59375, "grad_norm": 0.34896814823150635, "kl": 0.0018067359924316406, "learning_rate": 1.3651877133105804e-06, "loss": 0.0047, "num_tokens": 8402266.0, "reward": 0.000881195068359375, "reward_std": 0.4175148606300354, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.431640625, "rewards/tag_count_reward/std": 0.23626616597175598, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 3018.7734375, "completions/mean_terminated_length": 2165.392333984375, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 0.007510615144984744, "frac_reward_zero_std": 0.625, "grad_norm": 0.3120521008968353, "kl": 0.0018253326416015625, "learning_rate": 1.4334470989761092e-06, "loss": 0.0175, "num_tokens": 8810677.0, "reward": 0.000598907470703125, "reward_std": 0.3066524863243103, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.453125, "rewards/tag_count_reward/std": 0.28365930914878845, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 3091.84375, "completions/mean_terminated_length": 2324.079833984375, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 0.00785200674248405, "frac_reward_zero_std": 0.65625, "grad_norm": 0.2897898852825165, "kl": 0.002068042755126953, "learning_rate": 1.5017064846416384e-06, "loss": 0.0179, "num_tokens": 9225625.0, "reward": 0.0017852783203125, "reward_std": 0.24391448497772217, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.423828125, "rewards/tag_count_reward/std": 0.24414117634296417, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3347.0, "completions/mean_length": 2849.875, "completions/mean_terminated_length": 1495.822265625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.008193398339983357, "frac_reward_zero_std": 0.6875, "grad_norm": 0.27571403980255127, "kl": 0.0021605491638183594, "learning_rate": 1.5699658703071675e-06, "loss": 0.0145, "num_tokens": 9611593.0, "reward": 0.001026153564453125, "reward_std": 0.39203301072120667, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.458984375, "rewards/tag_count_reward/std": 0.31022176146507263, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 2990.140625, "completions/mean_terminated_length": 2032.69384765625, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "epoch": 0.008534789937482664, "frac_reward_zero_std": 0.375, "grad_norm": 0.40927574038505554, "kl": 0.002334117889404297, "learning_rate": 1.6382252559726963e-06, "loss": 0.0142, "num_tokens": 10014887.0, "reward": 0.002197265625, "reward_std": 0.6364043951034546, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.50390625, "rewards/tag_count_reward/std": 0.32297882437705994, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3549.0, "completions/mean_length": 2648.265625, "completions/mean_terminated_length": 1652.1612548828125, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.00887618153498197, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4048617482185364, "kl": 0.0028944015502929688, "learning_rate": 1.7064846416382255e-06, "loss": 0.0087, "num_tokens": 10374777.0, "reward": 0.001312255859375, "reward_std": 0.5847337245941162, "rewards/accuracy_reward/mean": 0.16129031777381897, "rewards/accuracy_reward/std": 0.36929062008857727, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58984375, "rewards/tag_count_reward/std": 0.3475651741027832, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 2600.390625, "completions/mean_terminated_length": 1647.046142578125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.009217573132481277, "frac_reward_zero_std": 0.53125, "grad_norm": 0.36150676012039185, "kl": 0.0032143592834472656, "learning_rate": 1.7747440273037543e-06, "loss": 0.0197, "num_tokens": 10726227.0, "reward": 0.0029449462890625, "reward_std": 0.5251610279083252, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.607421875, "rewards/tag_count_reward/std": 0.3613780438899994, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3495.0, "completions/mean_length": 2970.5859375, "completions/mean_terminated_length": 1913.4254150390625, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 0.009558964729980583, "frac_reward_zero_std": 0.65625, "grad_norm": 0.2988970875740051, "kl": 0.0028123855590820312, "learning_rate": 1.8430034129692834e-06, "loss": 0.0103, "num_tokens": 11130014.0, "reward": 0.00018310546875, "reward_std": 0.2955200672149658, "rewards/accuracy_reward/mean": 0.04032257944345474, "rewards/accuracy_reward/std": 0.19751271605491638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5078125, "rewards/tag_count_reward/std": 0.33780425786972046, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 2866.421875, "completions/mean_terminated_length": 2170.923095703125, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 0.00990035632747989, "frac_reward_zero_std": 0.5, "grad_norm": 0.3657653033733368, "kl": 0.0032558441162109375, "learning_rate": 1.9112627986348124e-06, "loss": 0.0217, "num_tokens": 11517012.0, "reward": 0.00176239013671875, "reward_std": 0.35369873046875, "rewards/accuracy_reward/mean": 0.14516128599643707, "rewards/accuracy_reward/std": 0.3536924421787262, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.3519621193408966, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 2818.3046875, "completions/mean_terminated_length": 1623.8199462890625, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.010241747924979196, "frac_reward_zero_std": 0.53125, "grad_norm": 0.4341751039028168, "kl": 0.0030908584594726562, "learning_rate": 1.9795221843003416e-06, "loss": 0.0018, "num_tokens": 11899027.0, "reward": 0.000606536865234375, "reward_std": 0.309902548789978, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.3320184051990509, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3536.0, "completions/mean_length": 2732.6640625, "completions/mean_terminated_length": 1826.4031982421875, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 0.010583139522478503, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3660432696342468, "kl": 0.0032444000244140625, "learning_rate": 2.0477815699658705e-06, "loss": 0.0208, "num_tokens": 12269784.0, "reward": 0.002910614013671875, "reward_std": 0.47606462240219116, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.599609375, "rewards/tag_count_reward/std": 0.34272506833076477, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2979.0, "completions/mean_terminated_length": 2201.14306640625, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.01092453111997781, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3753754496574402, "kl": 0.002899169921875, "learning_rate": 2.1160409556313997e-06, "loss": 0.02, "num_tokens": 12671732.0, "reward": 0.00197601318359375, "reward_std": 0.5140612721443176, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.576171875, "rewards/tag_count_reward/std": 0.3529384434223175, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3503.0, "completions/mean_length": 2998.59375, "completions/mean_terminated_length": 2114.7451171875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "epoch": 0.011265922717477115, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3233071565628052, "kl": 0.0030641555786132812, "learning_rate": 2.1843003412969285e-06, "loss": -0.0043, "num_tokens": 13078016.0, "reward": 0.001842498779296875, "reward_std": 0.4949609637260437, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.544921875, "rewards/tag_count_reward/std": 0.3527640998363495, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3536.0, "completions/mean_length": 2949.46875, "completions/mean_terminated_length": 2133.64306640625, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 0.011607314314976422, "frac_reward_zero_std": 0.46875, "grad_norm": 0.36739853024482727, "kl": 0.004076957702636719, "learning_rate": 2.2525597269624573e-06, "loss": 0.0289, "num_tokens": 13475144.0, "reward": 0.0015869140625, "reward_std": 0.4902920126914978, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5703125, "rewards/tag_count_reward/std": 0.3421468436717987, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 2584.8828125, "completions/mean_terminated_length": 1675.23876953125, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.011948705912475729, "frac_reward_zero_std": 0.5, "grad_norm": 0.39786967635154724, "kl": 0.004866600036621094, "learning_rate": 2.3208191126279866e-06, "loss": 0.0323, "num_tokens": 13827793.0, "reward": 0.0001678466796875, "reward_std": 0.6831117868423462, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44340085983276367, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.642578125, "rewards/tag_count_reward/std": 0.35311269760131836, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3549.0, "completions/mean_length": 2963.265625, "completions/mean_terminated_length": 1994.919921875, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.012290097509975036, "frac_reward_zero_std": 0.625, "grad_norm": 0.3501143157482147, "kl": 0.003452301025390625, "learning_rate": 2.3890784982935154e-06, "loss": 0.0102, "num_tokens": 14229295.0, "reward": 0.0025177001953125, "reward_std": 0.607700765132904, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.560546875, "rewards/tag_count_reward/std": 0.35459038615226746, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 3081.5625, "completions/mean_terminated_length": 2215.659423828125, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 0.012631489107474343, "frac_reward_zero_std": 0.53125, "grad_norm": 0.34029877185821533, "kl": 0.0034313201904296875, "learning_rate": 2.4573378839590446e-06, "loss": 0.0216, "num_tokens": 14643751.0, "reward": 0.0024261474609375, "reward_std": 0.6686677932739258, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.525390625, "rewards/tag_count_reward/std": 0.3588584065437317, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2813.6171875, "completions/mean_terminated_length": 1940.5167236328125, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "epoch": 0.012972880704973648, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3123636543750763, "kl": 0.004638671875, "learning_rate": 2.5255972696245735e-06, "loss": 0.018, "num_tokens": 15025354.0, "reward": 0.002651214599609375, "reward_std": 0.4863770604133606, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58984375, "rewards/tag_count_reward/std": 0.35733920335769653, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2845.46875, "completions/mean_terminated_length": 2034.294921875, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.013314272302472955, "frac_reward_zero_std": 0.5625, "grad_norm": 0.36341360211372375, "kl": 0.004730224609375, "learning_rate": 2.5938566552901023e-06, "loss": 0.0292, "num_tokens": 15410330.0, "reward": 0.0030975341796875, "reward_std": 0.5843051671981812, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.3561321198940277, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3524.0, "completions/mean_length": 2996.734375, "completions/mean_terminated_length": 2017.9583740234375, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.013655663899972262, "frac_reward_zero_std": 0.65625, "grad_norm": 0.28499290347099304, "kl": 0.004097938537597656, "learning_rate": 2.662116040955632e-06, "loss": 0.017, "num_tokens": 15815340.0, "reward": 6.103515625e-05, "reward_std": 0.3402343988418579, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5390625, "rewards/tag_count_reward/std": 0.35967710614204407, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 2800.5625, "completions/mean_terminated_length": 2087.283447265625, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 0.013997055497471569, "frac_reward_zero_std": 0.5, "grad_norm": 7450.0625, "kl": 4.943304061889648, "learning_rate": 2.7303754266211608e-06, "loss": 0.218, "num_tokens": 16192808.0, "reward": 0.000156402587890625, "reward_std": 0.5166099071502686, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.64453125, "rewards/tag_count_reward/std": 0.35716700553894043, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3432.0, "completions/mean_length": 2949.6328125, "completions/mean_terminated_length": 1856.3616943359375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.014338447094970876, "frac_reward_zero_std": 0.53125, "grad_norm": 0.3683777153491974, "kl": 0.0050487518310546875, "learning_rate": 2.7986348122866896e-06, "loss": 0.0032, "num_tokens": 16592981.0, "reward": 0.00202178955078125, "reward_std": 0.7033917307853699, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5390625, "rewards/tag_count_reward/std": 0.35276955366134644, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 2975.03125, "completions/mean_terminated_length": 2192.071533203125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.01467983869247018, "frac_reward_zero_std": 0.5, "grad_norm": 0.34055230021476746, "kl": 0.0054836273193359375, "learning_rate": 2.8668941979522184e-06, "loss": 0.0161, "num_tokens": 16993465.0, "reward": 0.0029144287109375, "reward_std": 0.628680944442749, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.36869287490844727, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 2863.265625, "completions/mean_terminated_length": 2020.3729248046875, "completions/min_length": 572.0, "completions/min_terminated_length": 572.0, "epoch": 0.015021230289969488, "frac_reward_zero_std": 0.5, "grad_norm": 0.31853213906288147, "kl": 0.0065097808837890625, "learning_rate": 2.9351535836177476e-06, "loss": 0.0153, "num_tokens": 17378795.0, "reward": -0.000553131103515625, "reward_std": 0.5640521049499512, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.607421875, "rewards/tag_count_reward/std": 0.3600136637687683, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3332.0, "completions/mean_length": 2718.359375, "completions/mean_terminated_length": 1852.71875, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 0.015362621887468795, "frac_reward_zero_std": 0.5625, "grad_norm": 0.32943058013916016, "kl": 0.0060100555419921875, "learning_rate": 3.003412969283277e-06, "loss": 0.0238, "num_tokens": 17748465.0, "reward": 0.0032501220703125, "reward_std": 0.6300081610679626, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.634765625, "rewards/tag_count_reward/std": 0.37107837200164795, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 2912.53125, "completions/mean_terminated_length": 2127.254150390625, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 0.0157040134849681, "frac_reward_zero_std": 0.625, "grad_norm": 0.29115408658981323, "kl": 0.006103515625, "learning_rate": 3.0716723549488057e-06, "loss": 0.0165, "num_tokens": 18141161.0, "reward": 0.00238800048828125, "reward_std": 0.541630744934082, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.611328125, "rewards/tag_count_reward/std": 0.3709540367126465, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 2826.984375, "completions/mean_terminated_length": 2069.96875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.016045405082467407, "frac_reward_zero_std": 0.53125, "grad_norm": 0.4214458167552948, "kl": 0.007518768310546875, "learning_rate": 3.139931740614335e-06, "loss": -0.0141, "num_tokens": 18523251.0, "reward": 0.001895904541015625, "reward_std": 0.5457310080528259, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.626953125, "rewards/tag_count_reward/std": 0.371202677488327, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3390.0, "completions/mean_length": 2988.796875, "completions/mean_terminated_length": 2247.403564453125, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.016386796679966713, "frac_reward_zero_std": 0.53125, "grad_norm": 0.341756135225296, "kl": 0.006458282470703125, "learning_rate": 3.2081911262798638e-06, "loss": 0.0237, "num_tokens": 18927273.0, "reward": 0.00267791748046875, "reward_std": 0.5733039975166321, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.609375, "rewards/tag_count_reward/std": 0.36553001403808594, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3566.0, "completions/mean_length": 2800.65625, "completions/mean_terminated_length": 2017.3125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.01672818827746602, "frac_reward_zero_std": 0.625, "grad_norm": 0.3069651126861572, "kl": 0.0064334869384765625, "learning_rate": 3.2764505119453926e-06, "loss": 0.0132, "num_tokens": 19305385.0, "reward": 0.0005950927734375, "reward_std": 0.3235563039779663, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.370876282453537, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3513.0, "completions/mean_length": 2597.2578125, "completions/mean_terminated_length": 1805.08447265625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.017069579874965327, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5284867882728577, "kl": 0.007978439331054688, "learning_rate": 3.3447098976109214e-06, "loss": 0.0464, "num_tokens": 19656130.0, "reward": 0.005950927734375, "reward_std": 0.9593034982681274, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66796875, "rewards/tag_count_reward/std": 0.36869287490844727, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3363.0, "completions/mean_length": 2765.78125, "completions/mean_terminated_length": 1972.7384033203125, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.017410971472464634, "frac_reward_zero_std": 0.59375, "grad_norm": 0.31464430689811707, "kl": 0.006683349609375, "learning_rate": 3.412969283276451e-06, "loss": 0.0169, "num_tokens": 20031238.0, "reward": 0.00119781494140625, "reward_std": 0.6432621479034424, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.638671875, "rewards/tag_count_reward/std": 0.3735978901386261, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3525.0, "completions/mean_length": 2621.65625, "completions/mean_terminated_length": 1824.2857666015625, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.01775236306996394, "frac_reward_zero_std": 0.6875, "grad_norm": 0.33191078901290894, "kl": 0.0063934326171875, "learning_rate": 3.48122866894198e-06, "loss": -0.0033, "num_tokens": 20387670.0, "reward": 0.000736236572265625, "reward_std": 0.5908652544021606, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.36212730407714844, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 2866.546875, "completions/mean_terminated_length": 2078.5244140625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.018093754667463248, "frac_reward_zero_std": 0.65625, "grad_norm": 0.29009169340133667, "kl": 0.006519317626953125, "learning_rate": 3.5494880546075087e-06, "loss": 0.0087, "num_tokens": 20774408.0, "reward": 0.0001983642578125, "reward_std": 0.5209083557128906, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.619140625, "rewards/tag_count_reward/std": 0.36849987506866455, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 2656.0546875, "completions/mean_terminated_length": 1862.59423828125, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.018435146264962555, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3814855217933655, "kl": 0.00759124755859375, "learning_rate": 3.617747440273038e-06, "loss": 0.0319, "num_tokens": 21136415.0, "reward": 0.003551483154296875, "reward_std": 0.38625314831733704, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.666015625, "rewards/tag_count_reward/std": 0.37024855613708496, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3449.0, "completions/mean_length": 2864.515625, "completions/mean_terminated_length": 2229.676513671875, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.018776537862461862, "frac_reward_zero_std": 0.53125, "grad_norm": 1.217660665512085, "kl": 0.007844924926757812, "learning_rate": 3.6860068259385667e-06, "loss": 0.0156, "num_tokens": 21522569.0, "reward": 0.00511932373046875, "reward_std": 0.8529055118560791, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66015625, "rewards/tag_count_reward/std": 0.3655090034008026, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.40625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3457.0, "completions/mean_length": 2440.359375, "completions/mean_terminated_length": 1682.883056640625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.019117929459961165, "frac_reward_zero_std": 0.59375, "grad_norm": 0.36298757791519165, "kl": 0.00742340087890625, "learning_rate": 3.7542662116040956e-06, "loss": 0.0238, "num_tokens": 21855515.0, "reward": 0.00156402587890625, "reward_std": 0.7557405233383179, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.720703125, "rewards/tag_count_reward/std": 0.35718318819999695, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3484.0, "completions/mean_length": 3029.109375, "completions/mean_terminated_length": 2134.48974609375, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 0.019459321057460472, "frac_reward_zero_std": 0.5625, "grad_norm": 0.30814799666404724, "kl": 0.0069561004638671875, "learning_rate": 3.822525597269625e-06, "loss": 0.0104, "num_tokens": 22264001.0, "reward": 0.000705718994140625, "reward_std": 0.5439624786376953, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.556640625, "rewards/tag_count_reward/std": 0.3634573221206665, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3557.0, "completions/mean_length": 3030.6875, "completions/mean_terminated_length": 2222.0, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "epoch": 0.01980071265495978, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3250781297683716, "kl": 0.0071563720703125, "learning_rate": 3.890784982935154e-06, "loss": 0.0012, "num_tokens": 22676293.0, "reward": 0.002506256103515625, "reward_std": 0.5304378867149353, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.56640625, "rewards/tag_count_reward/std": 0.3678576648235321, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 3007.96875, "completions/mean_terminated_length": 1908.2728271484375, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.020142104252459086, "frac_reward_zero_std": 0.6875, "grad_norm": 0.32359039783477783, "kl": 0.0066986083984375, "learning_rate": 3.959044368600683e-06, "loss": 0.0132, "num_tokens": 23079701.0, "reward": 0.004535675048828125, "reward_std": 0.4160192012786865, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.3563264012336731, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3557.0, "completions/mean_length": 2620.8984375, "completions/mean_terminated_length": 1771.1029052734375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.020483495849958393, "frac_reward_zero_std": 0.59375, "grad_norm": 0.39053672552108765, "kl": 0.00881195068359375, "learning_rate": 4.027303754266212e-06, "loss": 0.0286, "num_tokens": 23437312.0, "reward": 0.0034332275390625, "reward_std": 0.5771381855010986, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.36913058161735535, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2830.4921875, "completions/mean_terminated_length": 2100.169189453125, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "epoch": 0.0208248874474577, "frac_reward_zero_std": 0.5, "grad_norm": 0.38198620080947876, "kl": 0.008745193481445312, "learning_rate": 4.095563139931741e-06, "loss": 0.0383, "num_tokens": 23819427.0, "reward": 0.000965118408203125, "reward_std": 0.8253244757652283, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.673828125, "rewards/tag_count_reward/std": 0.3584725260734558, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 2511.609375, "completions/mean_terminated_length": 1777.868408203125, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "epoch": 0.021166279044957007, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3834448754787445, "kl": 0.01055908203125, "learning_rate": 4.163822525597269e-06, "loss": 0.0171, "num_tokens": 24160673.0, "reward": 0.000576019287109375, "reward_std": 0.4872433841228485, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.705078125, "rewards/tag_count_reward/std": 0.36375337839126587, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3360.0, "completions/mean_length": 2714.171875, "completions/mean_terminated_length": 1788.2257080078125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.021507670642456313, "frac_reward_zero_std": 0.6875, "grad_norm": 0.31803035736083984, "kl": 0.00940704345703125, "learning_rate": 4.232081911262799e-06, "loss": -0.0061, "num_tokens": 24530131.0, "reward": -0.0001678466796875, "reward_std": 0.4087941646575928, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.3710213899612427, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3434.0, "completions/mean_length": 2462.765625, "completions/mean_terminated_length": 1670.4267578125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.02184906223995562, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3735447824001312, "kl": 0.010135650634765625, "learning_rate": 4.300341296928328e-06, "loss": 0.0194, "num_tokens": 24866377.0, "reward": 0.00437164306640625, "reward_std": 0.9415403604507446, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.697265625, "rewards/tag_count_reward/std": 0.366742879152298, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3359.0, "completions/mean_length": 2665.7890625, "completions/mean_terminated_length": 1688.338623046875, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "epoch": 0.022190453837454927, "frac_reward_zero_std": 0.78125, "grad_norm": 84762.484375, "kl": 15.071924209594727, "learning_rate": 4.368600682593857e-06, "loss": 0.6225, "num_tokens": 25228910.0, "reward": 0.003627777099609375, "reward_std": 0.3804410994052887, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.62109375, "rewards/tag_count_reward/std": 0.37382936477661133, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 2788.828125, "completions/mean_terminated_length": 1993.65625, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.02253184543495423, "frac_reward_zero_std": 0.625, "grad_norm": 2.99827241897583, "kl": 0.0113525390625, "learning_rate": 4.436860068259386e-06, "loss": 0.0109, "num_tokens": 25606496.0, "reward": 0.00197601318359375, "reward_std": 0.4626302421092987, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.3668738901615143, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 3045.7734375, "completions/mean_terminated_length": 2308.20361328125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.022873237032453538, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3501081168651581, "kl": 0.009805679321289062, "learning_rate": 4.505119453924915e-06, "loss": 0.0375, "num_tokens": 26014751.0, "reward": 0.001739501953125, "reward_std": 0.6960635185241699, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.36869287490844727, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 3045.1953125, "completions/mean_terminated_length": 1941.9285888671875, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.023214628629952844, "frac_reward_zero_std": 0.65625, "grad_norm": 0.5201196670532227, "kl": 0.01055908203125, "learning_rate": 4.573378839590444e-06, "loss": 0.0098, "num_tokens": 26425140.0, "reward": 0.00273895263671875, "reward_std": 0.550774335861206, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5, "rewards/tag_count_reward/std": 0.3535533845424652, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3534.0, "completions/mean_length": 2964.25, "completions/mean_terminated_length": 1997.43994140625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.02355602022745215, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2548806071281433, "kl": 0.011013031005859375, "learning_rate": 4.641638225255973e-06, "loss": 0.0105, "num_tokens": 26826948.0, "reward": 0.000789642333984375, "reward_std": 0.32734501361846924, "rewards/accuracy_reward/mean": 0.05645161122083664, "rewards/accuracy_reward/std": 0.23172801733016968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.54296875, "rewards/tag_count_reward/std": 0.3673556447029114, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.09375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3502.0, "completions/mean_length": 2857.453125, "completions/mean_terminated_length": 2195.97021484375, "completions/min_length": 959.0, "completions/min_terminated_length": 959.0, "epoch": 0.023897411824951458, "frac_reward_zero_std": 0.65625, "grad_norm": 0.4868597686290741, "kl": 0.01566314697265625, "learning_rate": 4.709897610921502e-06, "loss": 0.0198, "num_tokens": 27213642.0, "reward": 0.002918243408203125, "reward_std": 0.6740466952323914, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.646484375, "rewards/tag_count_reward/std": 0.3745434880256653, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 2848.8125, "completions/mean_terminated_length": 1961.5172119140625, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "epoch": 0.024238803422450765, "frac_reward_zero_std": 0.8125, "grad_norm": 0.24871806800365448, "kl": 0.011508941650390625, "learning_rate": 4.778156996587031e-06, "loss": 0.0006, "num_tokens": 27598602.0, "reward": 0.001068115234375, "reward_std": 0.19123640656471252, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.3683590292930603, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3434.0, "completions/mean_length": 2854.3828125, "completions/mean_terminated_length": 1973.810302734375, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "epoch": 0.024580195019950072, "frac_reward_zero_std": 0.625, "grad_norm": 0.3559815287590027, "kl": 0.0116729736328125, "learning_rate": 4.84641638225256e-06, "loss": 0.0126, "num_tokens": 27986775.0, "reward": 0.002445220947265625, "reward_std": 0.42418408393859863, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.60546875, "rewards/tag_count_reward/std": 0.37068963050842285, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 2677.515625, "completions/mean_terminated_length": 1712.54833984375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.02492158661744938, "frac_reward_zero_std": 0.75, "grad_norm": 0.24723176658153534, "kl": 0.012844085693359375, "learning_rate": 4.914675767918089e-06, "loss": 0.0069, "num_tokens": 28352365.0, "reward": 0.00041961669921875, "reward_std": 0.2619178295135498, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.625, "rewards/tag_count_reward/std": 0.3725312352180481, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.1875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 2620.2109375, "completions/mean_terminated_length": 1821.642822265625, "completions/min_length": 479.0, "completions/min_terminated_length": 479.0, "epoch": 0.025262978214948686, "frac_reward_zero_std": 0.625, "grad_norm": 0.3266037404537201, "kl": 0.015506744384765625, "learning_rate": 4.982935153583618e-06, "loss": -0.0043, "num_tokens": 28707500.0, "reward": 0.00225830078125, "reward_std": 0.43055176734924316, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.67578125, "rewards/tag_count_reward/std": 0.36769041419029236, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 3015.3828125, "completions/mean_terminated_length": 2284.3037109375, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "epoch": 0.025604369812447993, "frac_reward_zero_std": 0.5, "grad_norm": 0.36074161529541016, "kl": 0.01390838623046875, "learning_rate": 5.051194539249147e-06, "loss": 0.0255, "num_tokens": 29114749.0, "reward": 0.006420135498046875, "reward_std": 0.7547076940536499, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.609375, "rewards/tag_count_reward/std": 0.36553001403808594, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3411.0, "completions/mean_length": 2820.640625, "completions/mean_terminated_length": 1740.4151611328125, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "epoch": 0.025945761409947296, "frac_reward_zero_std": 0.625, "grad_norm": 0.36523059010505676, "kl": 0.014423370361328125, "learning_rate": 5.119453924914676e-06, "loss": -0.0035, "num_tokens": 29494491.0, "reward": 0.001819610595703125, "reward_std": 0.5952407121658325, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59765625, "rewards/tag_count_reward/std": 0.36212730407714844, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3449.0, "completions/mean_length": 3054.53125, "completions/mean_terminated_length": 1931.0242919921875, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.026287153007446603, "frac_reward_zero_std": 0.75, "grad_norm": 16.166582107543945, "kl": 0.02136993408203125, "learning_rate": 5.1877133105802046e-06, "loss": 0.0093, "num_tokens": 29906127.0, "reward": 0.00072479248046875, "reward_std": 0.21554121375083923, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.517578125, "rewards/tag_count_reward/std": 0.35101595520973206, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3383.0, "completions/mean_length": 2751.3671875, "completions/mean_terminated_length": 1714.22802734375, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.02662854460494591, "frac_reward_zero_std": 0.65625, "grad_norm": 9.045215606689453, "kl": 0.016956329345703125, "learning_rate": 5.255972696245735e-06, "loss": 0.0206, "num_tokens": 30279870.0, "reward": 0.000457763671875, "reward_std": 0.3662906885147095, "rewards/accuracy_reward/mean": 0.17741934955120087, "rewards/accuracy_reward/std": 0.3835729956626892, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.611328125, "rewards/tag_count_reward/std": 0.366952508687973, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 3106.0546875, "completions/mean_terminated_length": 2335.48974609375, "completions/min_length": 575.0, "completions/min_terminated_length": 575.0, "epoch": 0.026969936202445217, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3251168131828308, "kl": 0.01470184326171875, "learning_rate": 5.324232081911264e-06, "loss": 0.0249, "num_tokens": 30698881.0, "reward": 0.004150390625, "reward_std": 0.4456140995025635, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.194504976272583, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.552734375, "rewards/tag_count_reward/std": 0.3640492260456085, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 2823.6875, "completions/mean_terminated_length": 2109.45458984375, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "epoch": 0.027311327799944524, "frac_reward_zero_std": 0.625, "grad_norm": 3.5720200538635254, "kl": 0.019832611083984375, "learning_rate": 5.392491467576792e-06, "loss": 0.009, "num_tokens": 31082637.0, "reward": 0.04381752014160156, "reward_std": 0.6415413618087769, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.66015625, "rewards/tag_count_reward/std": 0.36685293912887573, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3388.0, "completions/mean_length": 2573.1015625, "completions/mean_terminated_length": 1708.710205078125, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.02765271939744383, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2961677312850952, "kl": 0.0181427001953125, "learning_rate": 5.4607508532423215e-06, "loss": 0.0047, "num_tokens": 31431234.0, "reward": 0.000293731689453125, "reward_std": 0.3574065864086151, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.69921875, "rewards/tag_count_reward/std": 0.35126230120658875, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 3171.34375, "completions/mean_terminated_length": 2506.040771484375, "completions/min_length": 924.0, "completions/min_terminated_length": 924.0, "epoch": 0.027994110994943137, "frac_reward_zero_std": 0.34375, "grad_norm": 0.37972211837768555, "kl": 0.015811920166015625, "learning_rate": 5.529010238907851e-06, "loss": 0.0141, "num_tokens": 31858298.0, "reward": 0.0066375732421875, "reward_std": 1.0201246738433838, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.560546875, "rewards/tag_count_reward/std": 0.3628220558166504, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.28125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3485.0, "completions/mean_length": 3118.8984375, "completions/mean_terminated_length": 2131.9755859375, "completions/min_length": 1028.0, "completions/min_terminated_length": 1028.0, "epoch": 0.028335502592442444, "frac_reward_zero_std": 0.71875, "grad_norm": 0.24274705350399017, "kl": 0.01612091064453125, "learning_rate": 5.597269624573379e-06, "loss": 0.0123, "num_tokens": 32277333.0, "reward": 0.001544952392578125, "reward_std": 0.29567790031433105, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.509765625, "rewards/tag_count_reward/std": 0.3513225018978119, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3564.0, "completions/mean_length": 3001.4609375, "completions/mean_terminated_length": 2150.057861328125, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.02867689418994175, "frac_reward_zero_std": 0.5, "grad_norm": 0.34303712844848633, "kl": 0.01653289794921875, "learning_rate": 5.665529010238908e-06, "loss": 0.015, "num_tokens": 32681460.0, "reward": -0.00011444091796875, "reward_std": 0.6853011846542358, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59375, "rewards/tag_count_reward/std": 0.3618086576461792, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3431.0, "completions/mean_length": 2987.8828125, "completions/mean_terminated_length": 2144.32080078125, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "epoch": 0.029018285787441058, "frac_reward_zero_std": 0.59375, "grad_norm": 0.3093225657939911, "kl": 0.01650238037109375, "learning_rate": 5.733788395904437e-06, "loss": 0.0252, "num_tokens": 33084149.0, "reward": 0.003437042236328125, "reward_std": 0.5121632218360901, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.576171875, "rewards/tag_count_reward/std": 0.3679569661617279, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 2982.4296875, "completions/mean_terminated_length": 2131.15087890625, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.02935967738494036, "frac_reward_zero_std": 0.59375, "grad_norm": 3.8639724254608154, "kl": 0.019775390625, "learning_rate": 5.802047781569966e-06, "loss": 0.0107, "num_tokens": 33488488.0, "reward": 0.0023956298828125, "reward_std": 0.6331969499588013, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.572265625, "rewards/tag_count_reward/std": 0.3687502145767212, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3520.0, "completions/mean_length": 2864.4765625, "completions/mean_terminated_length": 1778.1373291015625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.02970106898243967, "frac_reward_zero_std": 0.625, "grad_norm": 1.4566076057590432e+19, "kl": 5.566222640034611e+16, "learning_rate": 5.870307167235495e-06, "loss": 2228619338317824.0, "num_tokens": 33876073.0, "reward": 0.001132965087890625, "reward_std": 0.5009557008743286, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.568359375, "rewards/tag_count_reward/std": 0.3654826879501343, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3444.0, "completions/mean_length": 3022.6875, "completions/mean_terminated_length": 2228.37744140625, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 0.030042460579938975, "frac_reward_zero_std": 0.40625, "grad_norm": 199518584832.0, "kl": 1224802304.0164566, "learning_rate": 5.938566552901024e-06, "loss": 48926848.0, "num_tokens": 34285993.0, "reward": 0.0205838680267334, "reward_std": 0.8989654183387756, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58984375, "rewards/tag_count_reward/std": 0.36280617117881775, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2938.1015625, "completions/mean_terminated_length": 2228.672119140625, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.030383852177438282, "frac_reward_zero_std": 0.59375, "grad_norm": 1.6419837474822998, "kl": 0.02252960205078125, "learning_rate": 6.006825938566554e-06, "loss": 0.0135, "num_tokens": 34681762.0, "reward": 0.00408172607421875, "reward_std": 0.5849268436431885, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.626953125, "rewards/tag_count_reward/std": 0.3698745369911194, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3514.0, "completions/mean_length": 2885.09375, "completions/mean_terminated_length": 2164.000244140625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.03072524377493759, "frac_reward_zero_std": 0.71875, "grad_norm": 0.34640344977378845, "kl": 0.02391815185546875, "learning_rate": 6.075085324232083e-06, "loss": -0.0011, "num_tokens": 35071098.0, "reward": 0.00148773193359375, "reward_std": 0.38955438137054443, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.36212730407714844, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3289.0, "completions/mean_length": 2874.0546875, "completions/mean_terminated_length": 1931.7635498046875, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 0.031066635372436896, "frac_reward_zero_std": 0.4375, "grad_norm": 0.41620391607284546, "kl": 0.021392822265625, "learning_rate": 6.143344709897611e-06, "loss": 0.0514, "num_tokens": 35459081.0, "reward": 0.002918243408203125, "reward_std": 0.7349972724914551, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.607421875, "rewards/tag_count_reward/std": 0.3640914559364319, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3455.0, "completions/mean_length": 3117.6875, "completions/mean_terminated_length": 2195.906982421875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.0314080269699362, "frac_reward_zero_std": 0.65625, "grad_norm": 3539.251220703125, "kl": 35.76856231689453, "learning_rate": 6.211604095563141e-06, "loss": 1.4496, "num_tokens": 35880541.0, "reward": 0.0396728515625, "reward_std": 0.6079472899436951, "rewards/accuracy_reward/mean": 0.13333334028720856, "rewards/accuracy_reward/std": 0.3413599729537964, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.537109375, "rewards/tag_count_reward/std": 0.3536784052848816, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 2785.484375, "completions/mean_terminated_length": 1908.4261474609375, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "epoch": 0.03174941856743551, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3327024579048157, "kl": 0.024322509765625, "learning_rate": 6.27986348122867e-06, "loss": 0.0079, "num_tokens": 36255647.0, "reward": 0.00299072265625, "reward_std": 0.5673510432243347, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6171875, "rewards/tag_count_reward/std": 0.3711249828338623, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3450.0, "completions/mean_length": 2925.203125, "completions/mean_terminated_length": 1863.0611572265625, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.03209081016493481, "frac_reward_zero_std": 0.71875, "grad_norm": 2.758816719055176, "kl": 0.02474212646484375, "learning_rate": 6.348122866894198e-06, "loss": 0.0092, "num_tokens": 36650769.0, "reward": 0.00435638427734375, "reward_std": 0.4180721640586853, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.548828125, "rewards/tag_count_reward/std": 0.36459797620773315, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 3016.6171875, "completions/mean_terminated_length": 2038.787109375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.032432201762434124, "frac_reward_zero_std": 0.65625, "grad_norm": 0.577880322933197, "kl": 0.031463623046875, "learning_rate": 6.4163822525597275e-06, "loss": 0.0101, "num_tokens": 37058572.0, "reward": 0.0013885498046875, "reward_std": 0.4550042748451233, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.548828125, "rewards/tag_count_reward/std": 0.360525906085968, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 2883.828125, "completions/mean_terminated_length": 2038.7930908203125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.03277359335993343, "frac_reward_zero_std": 0.65625, "grad_norm": 60.52473449707031, "kl": 0.088592529296875, "learning_rate": 6.484641638225257e-06, "loss": 0.0154, "num_tokens": 37446438.0, "reward": 1.1444091796875e-05, "reward_std": 0.4328586459159851, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6171875, "rewards/tag_count_reward/std": 0.36712533235549927, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 2793.6484375, "completions/mean_terminated_length": 1952.306396484375, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 0.03311498495743274, "frac_reward_zero_std": 0.53125, "grad_norm": 1.158890962600708, "kl": 0.0261383056640625, "learning_rate": 6.552901023890785e-06, "loss": 0.0202, "num_tokens": 37824317.0, "reward": 0.0026397705078125, "reward_std": 0.7154542207717896, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.64453125, "rewards/tag_count_reward/std": 0.3653406500816345, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.34375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3164.0, "completions/mean_length": 3023.2578125, "completions/mean_terminated_length": 1914.81396484375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.03345637655493204, "frac_reward_zero_std": 0.53125, "grad_norm": 0.340549111366272, "kl": 0.0261688232421875, "learning_rate": 6.621160409556314e-06, "loss": 0.004, "num_tokens": 38232798.0, "reward": 0.000213623046875, "reward_std": 0.5956844687461853, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.533203125, "rewards/tag_count_reward/std": 0.3540695309638977, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3564.0, "completions/mean_length": 2870.015625, "completions/mean_terminated_length": 2008.310302734375, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.03379776815243135, "frac_reward_zero_std": 0.625, "grad_norm": 1.704431414604187, "kl": 0.03084564208984375, "learning_rate": 6.689419795221843e-06, "loss": 0.0063, "num_tokens": 38620660.0, "reward": 0.005283355712890625, "reward_std": 0.7465252876281738, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.59765625, "rewards/tag_count_reward/std": 0.3728407323360443, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3421.0, "completions/mean_length": 2759.8359375, "completions/mean_terminated_length": 1854.6064453125, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.034139159749930655, "frac_reward_zero_std": 0.53125, "grad_norm": 1.7370818853378296, "kl": 0.03079986572265625, "learning_rate": 6.757679180887372e-06, "loss": 0.0112, "num_tokens": 38994483.0, "reward": 0.00588226318359375, "reward_std": 0.6292628645896912, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.626953125, "rewards/tag_count_reward/std": 0.3698745369911194, "step": 100 } ], "logging_steps": 1, "max_steps": 2930, "num_input_tokens_seen": 38994483, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }