{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06827686267815994, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3556.0, "completions/mean_length": 2988.1640625, "completions/mean_terminated_length": 2171.648193359375, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "epoch": 0.0006827686267815994, "frac_reward_zero_std": 0.71875, "grad_norm": 4.210491180419922, "kl": 0.0044651031494140625, "learning_rate": 0.0, "loss": 0.0223, "num_tokens": 402745.0, "reward": 0.6328125, "reward_std": 0.17125242948532104, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.19950109720230103, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3405.0, "completions/mean_length": 2921.2265625, "completions/mean_terminated_length": 1983.339599609375, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "epoch": 0.0013655372535631989, "frac_reward_zero_std": 0.765625, "grad_norm": 0.2005176693201065, "kl": 0.0014462471008300781, "learning_rate": 1.360544217687075e-07, "loss": 0.0107, "num_tokens": 798226.0, "reward": 0.75390625, "reward_std": 0.22097086906433105, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.16596287488937378, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3556.0, "completions/mean_length": 2930.5234375, "completions/mean_terminated_length": 1804.319091796875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.002048305880344798, "frac_reward_zero_std": 0.734375, "grad_norm": 0.2179957926273346, "kl": 0.0014863014221191406, "learning_rate": 2.72108843537415e-07, "loss": 0.0023, "num_tokens": 1195691.0, "reward": 0.6171875, "reward_std": 0.18782521784305573, "rewards/accuracy_reward/mean": 0.1190476194024086, "rewards/accuracy_reward/std": 0.32513731718063354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.1658238172531128, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3570.0, "completions/mean_length": 3037.4453125, "completions/mean_terminated_length": 2238.634765625, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.0027310745071263977, "frac_reward_zero_std": 0.65625, "grad_norm": 0.22335194051265717, "kl": 0.0013914108276367188, "learning_rate": 4.0816326530612243e-07, "loss": 0.0033, "num_tokens": 1604400.0, "reward": 0.703125, "reward_std": 0.2154465913772583, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.37416577339172363, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.15686394274234772, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3536.0, "completions/mean_length": 2955.5546875, "completions/mean_terminated_length": 1835.2825927734375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.003413843133907997, "frac_reward_zero_std": 0.796875, "grad_norm": 0.18281564116477966, "kl": 0.0014123916625976562, "learning_rate": 5.4421768707483e-07, "loss": 0.0128, "num_tokens": 2002207.0, "reward": 0.51171875, "reward_std": 0.1657281517982483, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.13533776998519897, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3478.0, "completions/mean_length": 3012.984375, "completions/mean_terminated_length": 2178.423095703125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.004096611760689596, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2494056522846222, "kl": 0.0015425682067871094, "learning_rate": 6.802721088435376e-07, "loss": 0.0058, "num_tokens": 2407519.0, "reward": 0.724609375, "reward_std": 0.16296601295471191, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.396484375, "rewards/tag_count_reward/std": 0.16460207104682922, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3360.0, "completions/mean_length": 2910.0859375, "completions/mean_terminated_length": 1823.5714111328125, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.0047793803874711955, "frac_reward_zero_std": 0.765625, "grad_norm": 0.2239646017551422, "kl": 0.0014190673828125, "learning_rate": 8.163265306122449e-07, "loss": 0.0078, "num_tokens": 2800424.0, "reward": 0.73046875, "reward_std": 0.1657281517982483, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40234375, "rewards/tag_count_reward/std": 0.20865705609321594, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 2896.5625, "completions/mean_terminated_length": 1891.84619140625, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.0054621490142527955, "frac_reward_zero_std": 0.71875, "grad_norm": 0.1919669806957245, "kl": 0.0014629364013671875, "learning_rate": 9.523809523809525e-07, "loss": -0.0116, "num_tokens": 3191504.0, "reward": 0.685546875, "reward_std": 0.21820873022079468, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.404296875, "rewards/tag_count_reward/std": 0.20118030905723572, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3487.0, "completions/mean_length": 3061.140625, "completions/mean_terminated_length": 1990.5238037109375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "epoch": 0.006144917641034395, "frac_reward_zero_std": 0.71875, "grad_norm": 0.22426915168762207, "kl": 0.0014653205871582031, "learning_rate": 1.08843537414966e-06, "loss": 0.0131, "num_tokens": 3604446.0, "reward": 0.689453125, "reward_std": 0.1905873715877533, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.20087429881095886, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3511.0, "completions/mean_length": 2943.609375, "completions/mean_terminated_length": 1839.9573974609375, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 0.006827686267815994, "frac_reward_zero_std": 0.78125, "grad_norm": 0.18735478818416595, "kl": 0.0014424324035644531, "learning_rate": 1.2244897959183673e-06, "loss": 0.0051, "num_tokens": 4001828.0, "reward": 0.6171875, "reward_std": 0.13258251547813416, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.16876548528671265, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 2931.4375, "completions/mean_terminated_length": 2065.30908203125, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 0.007510454894597594, "frac_reward_zero_std": 0.75, "grad_norm": 0.22192414104938507, "kl": 0.0014195442199707031, "learning_rate": 1.3605442176870751e-06, "loss": 0.0151, "num_tokens": 4397866.0, "reward": 0.759765625, "reward_std": 0.17401456832885742, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.400390625, "rewards/tag_count_reward/std": 0.19170762598514557, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3574.0, "completions/mean_length": 3023.3359375, "completions/mean_terminated_length": 1875.3095703125, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.008193223521379193, "frac_reward_zero_std": 0.859375, "grad_norm": 0.15188324451446533, "kl": 0.0015492439270019531, "learning_rate": 1.4965986394557825e-06, "loss": 0.002, "num_tokens": 4805041.0, "reward": 0.60546875, "reward_std": 0.09943689405918121, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.19079293310642242, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3533.0, "completions/mean_length": 2855.375, "completions/mean_terminated_length": 1918.571533203125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.008875992148160792, "frac_reward_zero_std": 0.640625, "grad_norm": 0.24861852824687958, "kl": 0.0015034675598144531, "learning_rate": 1.6326530612244897e-06, "loss": 0.0011, "num_tokens": 5191263.0, "reward": 0.73828125, "reward_std": 0.24859221279621124, "rewards/accuracy_reward/mean": 0.1587301641702652, "rewards/accuracy_reward/std": 0.3668830394744873, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.42578125, "rewards/tag_count_reward/std": 0.22701281309127808, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 2844.3125, "completions/mean_terminated_length": 1922.9473876953125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.009558760774942391, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2638668417930603, "kl": 0.0016384124755859375, "learning_rate": 1.7687074829931975e-06, "loss": 0.0051, "num_tokens": 5576411.0, "reward": 0.58203125, "reward_std": 0.204398050904274, "rewards/accuracy_reward/mean": 0.0634920671582222, "rewards/accuracy_reward/std": 0.24481934309005737, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.45703125, "rewards/tag_count_reward/std": 0.2626885771751404, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 2938.796875, "completions/mean_terminated_length": 1863.4583740234375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.01024152940172399, "frac_reward_zero_std": 0.65625, "grad_norm": 0.2638472020626068, "kl": 0.0017695426940917969, "learning_rate": 1.904761904761905e-06, "loss": 0.0154, "num_tokens": 5973261.0, "reward": 0.673828125, "reward_std": 0.14639319479465485, "rewards/accuracy_reward/mean": 0.1190476194024086, "rewards/accuracy_reward/std": 0.32513731718063354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.439453125, "rewards/tag_count_reward/std": 0.263878732919693, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 2893.3671875, "completions/mean_terminated_length": 1976.708984375, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 0.010924298028505591, "frac_reward_zero_std": 0.65625, "grad_norm": 0.2649959623813629, "kl": 0.002013683319091797, "learning_rate": 2.0408163265306125e-06, "loss": 0.0044, "num_tokens": 6364418.0, "reward": 0.73046875, "reward_std": 0.16020387411117554, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.49609375, "rewards/tag_count_reward/std": 0.30896127223968506, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3543.0, "completions/mean_length": 2988.890625, "completions/mean_terminated_length": 1963.2764892578125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "epoch": 0.01160706665528719, "frac_reward_zero_std": 0.671875, "grad_norm": 0.25358450412750244, "kl": 0.0021944046020507812, "learning_rate": 2.17687074829932e-06, "loss": 0.0106, "num_tokens": 6768026.0, "reward": 0.671875, "reward_std": 0.24306795001029968, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.29262590408325195, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3450.0, "completions/mean_length": 2786.0859375, "completions/mean_terminated_length": 1852.9322509765625, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.01228983528206879, "frac_reward_zero_std": 0.671875, "grad_norm": 0.8762544393539429, "kl": 0.007658958435058594, "learning_rate": 2.3129251700680273e-06, "loss": 0.0293, "num_tokens": 7146639.0, "reward": 0.873046875, "reward_std": 0.2513543963432312, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.529296875, "rewards/tag_count_reward/std": 0.31935757398605347, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3565.0, "completions/mean_length": 2964.3203125, "completions/mean_terminated_length": 2028.7255859375, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 0.012972603908850388, "frac_reward_zero_std": 0.671875, "grad_norm": 0.3241795301437378, "kl": 0.0024280548095703125, "learning_rate": 2.4489795918367347e-06, "loss": 0.0135, "num_tokens": 7546810.0, "reward": 0.724609375, "reward_std": 0.229257270693779, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.505859375, "rewards/tag_count_reward/std": 0.32065513730049133, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 2949.1171875, "completions/mean_terminated_length": 2050.6982421875, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "epoch": 0.013655372535631987, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2362036257982254, "kl": 0.0020618438720703125, "learning_rate": 2.5850340136054425e-06, "loss": 0.0129, "num_tokens": 7945389.0, "reward": 0.70703125, "reward_std": 0.204398050904274, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.45703125, "rewards/tag_count_reward/std": 0.28254568576812744, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3466.0, "completions/mean_length": 2973.65625, "completions/mean_terminated_length": 2021.5198974609375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.014338141162413586, "frac_reward_zero_std": 0.6875, "grad_norm": 0.25698938965797424, "kl": 0.0022821426391601562, "learning_rate": 2.7210884353741503e-06, "loss": 0.0099, "num_tokens": 8346825.0, "reward": 0.72265625, "reward_std": 0.1657281517982483, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.44140625, "rewards/tag_count_reward/std": 0.265252023935318, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 3000.6796875, "completions/mean_terminated_length": 2090.699951171875, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.015020909789195187, "frac_reward_zero_std": 0.703125, "grad_norm": 0.2372422218322754, "kl": 0.002299785614013672, "learning_rate": 2.8571428571428573e-06, "loss": 0.0103, "num_tokens": 8750168.0, "reward": 0.75, "reward_std": 0.24306795001029968, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.25679734349250793, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 2941.484375, "completions/mean_terminated_length": 2141.157958984375, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 0.015703678415976786, "frac_reward_zero_std": 0.78125, "grad_norm": 0.19502384960651398, "kl": 0.0023593902587890625, "learning_rate": 2.993197278911565e-06, "loss": 0.001, "num_tokens": 9147484.0, "reward": 0.705078125, "reward_std": 0.1132475733757019, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.439453125, "rewards/tag_count_reward/std": 0.24851150810718536, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3510.0, "completions/mean_length": 3083.6640625, "completions/mean_terminated_length": 2160.822265625, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 0.016386447042758386, "frac_reward_zero_std": 0.71875, "grad_norm": 5267.92138671875, "kl": 11.689858436584473, "learning_rate": 3.1292517006802725e-06, "loss": 0.4802, "num_tokens": 9563039.0, "reward": 0.529296875, "reward_std": 0.12429611384868622, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.388671875, "rewards/tag_count_reward/std": 0.20765040814876556, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3512.0, "completions/mean_length": 2820.1875, "completions/mean_terminated_length": 2007.0966796875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.017069215669539985, "frac_reward_zero_std": 0.71875, "grad_norm": 0.23362644016742706, "kl": 0.002544403076171875, "learning_rate": 3.2653061224489794e-06, "loss": 0.016, "num_tokens": 9942985.0, "reward": 0.71875, "reward_std": 0.1657281517982483, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.421875, "rewards/tag_count_reward/std": 0.20027048885822296, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 2889.15625, "completions/mean_terminated_length": 2076.54248046875, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "epoch": 0.017751984296321584, "frac_reward_zero_std": 0.765625, "grad_norm": 0.23670873045921326, "kl": 0.0028171539306640625, "learning_rate": 3.4013605442176872e-06, "loss": 0.0009, "num_tokens": 10333771.0, "reward": 0.6875, "reward_std": 0.1657281517982483, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4375, "rewards/tag_count_reward/std": 0.24502933025360107, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 2818.9375, "completions/mean_terminated_length": 1951.8668212890625, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.018434752923103183, "frac_reward_zero_std": 0.703125, "grad_norm": 0.2265295833349228, "kl": 0.0028820037841796875, "learning_rate": 3.537414965986395e-06, "loss": 0.0234, "num_tokens": 10715521.0, "reward": 0.681640625, "reward_std": 0.27345144748687744, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.400390625, "rewards/tag_count_reward/std": 0.18114857375621796, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.15625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3509.0, "completions/mean_length": 2695.1953125, "completions/mean_terminated_length": 1935.202880859375, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "epoch": 0.019117521549884782, "frac_reward_zero_std": 0.71875, "grad_norm": 0.24095378816127777, "kl": 0.003009796142578125, "learning_rate": 3.6734693877551024e-06, "loss": -0.0067, "num_tokens": 11080544.0, "reward": 0.857421875, "reward_std": 0.18506309390068054, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.451171875, "rewards/tag_count_reward/std": 0.2152150422334671, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3532.0, "completions/mean_length": 3142.921875, "completions/mean_terminated_length": 2239.761962890625, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "epoch": 0.01980029017666638, "frac_reward_zero_std": 0.734375, "grad_norm": 0.22904202342033386, "kl": 0.0031566619873046875, "learning_rate": 3.80952380952381e-06, "loss": 0.0059, "num_tokens": 11505400.0, "reward": 0.55078125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.1770809292793274, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 2830.0234375, "completions/mean_terminated_length": 1829.2908935546875, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 0.02048305880344798, "frac_reward_zero_std": 0.875, "grad_norm": 0.2996601462364197, "kl": 0.0038623809814453125, "learning_rate": 3.945578231292517e-06, "loss": -0.0056, "num_tokens": 11887905.0, "reward": 0.689453125, "reward_std": 0.06905338913202286, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.14713835716247559, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 2802.984375, "completions/mean_terminated_length": 1889.59326171875, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 0.02116582743022958, "frac_reward_zero_std": 0.796875, "grad_norm": 0.19929054379463196, "kl": 0.0031557083129882812, "learning_rate": 4.081632653061225e-06, "loss": 0.0097, "num_tokens": 12266473.0, "reward": 0.78515625, "reward_std": 0.14915534853935242, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.16780593991279602, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 2698.1484375, "completions/mean_terminated_length": 1784.1746826171875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.021848596057011182, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2182374894618988, "kl": 0.0033788681030273438, "learning_rate": 4.217687074829933e-06, "loss": 0.0083, "num_tokens": 12633364.0, "reward": 0.6953125, "reward_std": 0.10496116429567337, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.14182965457439423, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3466.0, "completions/mean_length": 2816.75, "completions/mean_terminated_length": 1798.39990234375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.02253136468379278, "frac_reward_zero_std": 0.703125, "grad_norm": 0.19947372376918793, "kl": 0.003604888916015625, "learning_rate": 4.35374149659864e-06, "loss": 0.0065, "num_tokens": 13014872.0, "reward": 0.634765625, "reward_std": 0.2016359269618988, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.400390625, "rewards/tag_count_reward/std": 0.18912313878536224, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.25, "completions/max_length": 3584.0, "completions/max_terminated_length": 3524.0, "completions/mean_length": 3136.765625, "completions/mean_terminated_length": 2152.85009765625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.02321413331057438, "frac_reward_zero_std": 0.90625, "grad_norm": 0.1304818093776703, "kl": 0.0037279129028320312, "learning_rate": 4.489795918367348e-06, "loss": 0.0039, "num_tokens": 13435878.0, "reward": 0.59765625, "reward_std": 0.07733979821205139, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33203125, "rewards/tag_count_reward/std": 0.1178438812494278, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3455.0, "completions/mean_length": 2962.3359375, "completions/mean_terminated_length": 2082.62255859375, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.02389690193735598, "frac_reward_zero_std": 0.796875, "grad_norm": 0.20229387283325195, "kl": 0.004085540771484375, "learning_rate": 4.6258503401360546e-06, "loss": 0.0092, "num_tokens": 13836719.0, "reward": 0.603515625, "reward_std": 0.10772329568862915, "rewards/accuracy_reward/mean": 0.1190476194024086, "rewards/accuracy_reward/std": 0.32513731718063354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.15675361454486847, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3520.0, "completions/mean_length": 2883.203125, "completions/mean_terminated_length": 1953.054443359375, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.02457967056413758, "frac_reward_zero_std": 0.796875, "grad_norm": 0.19419854879379272, "kl": 0.004353523254394531, "learning_rate": 4.761904761904762e-06, "loss": 0.002, "num_tokens": 14227331.0, "reward": 0.591796875, "reward_std": 0.09115047752857208, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.404296875, "rewards/tag_count_reward/std": 0.21073804795742035, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.03125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3514.0, "completions/mean_length": 2679.1796875, "completions/mean_terminated_length": 1802.199951171875, "completions/min_length": 397.0, "completions/min_terminated_length": 397.0, "epoch": 0.025262439190919177, "frac_reward_zero_std": 0.796875, "grad_norm": 0.47850948572158813, "kl": 0.004796028137207031, "learning_rate": 4.897959183673469e-06, "loss": -0.0019, "num_tokens": 14591574.0, "reward": 0.673828125, "reward_std": 0.09115047752857208, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.15586812794208527, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3473.0, "completions/mean_length": 2937.609375, "completions/mean_terminated_length": 1961.6864013671875, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "epoch": 0.025945207817700777, "frac_reward_zero_std": 0.75, "grad_norm": 0.20070995390415192, "kl": 0.004593849182128906, "learning_rate": 5.034013605442177e-06, "loss": -0.0006, "num_tokens": 14987578.0, "reward": 0.703125, "reward_std": 0.23754367232322693, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.1471514254808426, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3409.0, "completions/mean_length": 2915.1328125, "completions/mean_terminated_length": 1836.7550048828125, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.026627976444482376, "frac_reward_zero_std": 0.796875, "grad_norm": 1.0198947191238403, "kl": 0.00588226318359375, "learning_rate": 5.170068027210885e-06, "loss": 0.0085, "num_tokens": 15381827.0, "reward": 0.615234375, "reward_std": 0.12982037663459778, "rewards/accuracy_reward/mean": 0.1269841343164444, "rewards/accuracy_reward/std": 0.33428433537483215, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.365234375, "rewards/tag_count_reward/std": 0.14682446420192719, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 2914.0625, "completions/mean_terminated_length": 1934.923095703125, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 0.027310745071263975, "frac_reward_zero_std": 0.78125, "grad_norm": 1.6164634227752686, "kl": 0.0067596435546875, "learning_rate": 5.306122448979593e-06, "loss": 0.0136, "num_tokens": 15776707.0, "reward": 0.6171875, "reward_std": 0.14915533363819122, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.18276500701904297, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3393.0, "completions/mean_length": 2885.84375, "completions/mean_terminated_length": 2016.2105712890625, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.027993513698045574, "frac_reward_zero_std": 0.703125, "grad_norm": 0.26528334617614746, "kl": 0.0051326751708984375, "learning_rate": 5.442176870748301e-06, "loss": -0.0034, "num_tokens": 16166281.0, "reward": 0.783203125, "reward_std": 0.23478154838085175, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.1737825870513916, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3570.0, "completions/mean_length": 2983.546875, "completions/mean_terminated_length": 2015.4693603515625, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.028676282324827173, "frac_reward_zero_std": 0.84375, "grad_norm": 0.19506986439228058, "kl": 0.005588531494140625, "learning_rate": 5.578231292517007e-06, "loss": 0.0106, "num_tokens": 16568053.0, "reward": 0.625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.19779790937900543, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3349.0, "completions/mean_length": 2984.0390625, "completions/mean_terminated_length": 1984.104248046875, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 0.029359050951608772, "frac_reward_zero_std": 0.765625, "grad_norm": 0.24852561950683594, "kl": 0.005970001220703125, "learning_rate": 5.7142857142857145e-06, "loss": 0.0063, "num_tokens": 16971424.0, "reward": 0.677734375, "reward_std": 0.12429611384868622, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.19833172857761383, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 2846.1796875, "completions/mean_terminated_length": 1835.0926513671875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.030041819578390375, "frac_reward_zero_std": 0.78125, "grad_norm": 94440972288.0, "kl": 395313152.0054207, "learning_rate": 5.850340136054422e-06, "loss": 15763150.0, "num_tokens": 17357709.0, "reward": 0.724609375, "reward_std": 0.1408689320087433, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.412109375, "rewards/tag_count_reward/std": 0.2231438010931015, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3453.0, "completions/mean_length": 2763.40625, "completions/mean_terminated_length": 2104.61962890625, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.030724588205171974, "frac_reward_zero_std": 0.65625, "grad_norm": 0.3163948059082031, "kl": 0.007244110107421875, "learning_rate": 5.98639455782313e-06, "loss": -0.0015, "num_tokens": 17731293.0, "reward": 0.8125, "reward_std": 0.2154465913772583, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.484375, "rewards/tag_count_reward/std": 0.25439056754112244, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3486.0, "completions/mean_length": 2967.421875, "completions/mean_terminated_length": 2223.27587890625, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 0.03140735683195357, "frac_reward_zero_std": 0.703125, "grad_norm": 47.566158294677734, "kl": 0.056438446044921875, "learning_rate": 6.122448979591837e-06, "loss": -0.0058, "num_tokens": 18132373.0, "reward": 0.70703125, "reward_std": 0.22097086906433105, "rewards/accuracy_reward/mean": 0.13709677755832672, "rewards/accuracy_reward/std": 0.34534481167793274, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.44140625, "rewards/tag_count_reward/std": 0.24399152398109436, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3296.0, "completions/mean_length": 2798.65625, "completions/mean_terminated_length": 1820.4210205078125, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "epoch": 0.03209012545873517, "frac_reward_zero_std": 0.765625, "grad_norm": 0.20147721469402313, "kl": 0.008129119873046875, "learning_rate": 6.258503401360545e-06, "loss": 0.011, "num_tokens": 18510231.0, "reward": 0.587890625, "reward_std": 0.13534466922283173, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.416015625, "rewards/tag_count_reward/std": 0.22683490812778473, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 3021.15625, "completions/mean_terminated_length": 2171.37255859375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.03277289408551677, "frac_reward_zero_std": 0.75, "grad_norm": 0.19647663831710815, "kl": 0.00884246826171875, "learning_rate": 6.394557823129253e-06, "loss": -0.0008, "num_tokens": 18917145.0, "reward": 0.77734375, "reward_std": 0.19334951043128967, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.43359375, "rewards/tag_count_reward/std": 0.2633901834487915, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 2990.2109375, "completions/mean_terminated_length": 2032.8775634765625, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 0.03345566271229837, "frac_reward_zero_std": 0.703125, "grad_norm": 0.22713559865951538, "kl": 0.009754180908203125, "learning_rate": 6.530612244897959e-06, "loss": 0.0017, "num_tokens": 19320790.0, "reward": 0.693359375, "reward_std": 0.1905873715877533, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.474609375, "rewards/tag_count_reward/std": 0.3055262863636017, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3533.0, "completions/mean_length": 2979.9765625, "completions/mean_terminated_length": 1903.2391357421875, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 0.03413843133907997, "frac_reward_zero_std": 0.734375, "grad_norm": 0.23159106075763702, "kl": 0.010221481323242188, "learning_rate": 6.666666666666667e-06, "loss": -0.0048, "num_tokens": 19722759.0, "reward": 0.625, "reward_std": 0.1767766773700714, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.46875, "rewards/tag_count_reward/std": 0.30254822969436646, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 3074.109375, "completions/mean_terminated_length": 2133.64453125, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 0.03482119996586157, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2358776032924652, "kl": 0.01113128662109375, "learning_rate": 6.8027210884353745e-06, "loss": -0.0064, "num_tokens": 20136081.0, "reward": 0.646484375, "reward_std": 0.19611164927482605, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.490234375, "rewards/tag_count_reward/std": 0.3220907747745514, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3434.0, "completions/mean_length": 2661.328125, "completions/mean_terminated_length": 1920.591552734375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.03550396859264317, "frac_reward_zero_std": 0.65625, "grad_norm": 0.25110501050949097, "kl": 0.011669158935546875, "learning_rate": 6.938775510204082e-06, "loss": 0.0017, "num_tokens": 20497903.0, "reward": 0.951171875, "reward_std": 0.29554852843284607, "rewards/accuracy_reward/mean": 0.17741934955120087, "rewards/accuracy_reward/std": 0.3835729956626892, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.607421875, "rewards/tag_count_reward/std": 0.35031425952911377, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3571.0, "completions/mean_length": 3005.2265625, "completions/mean_terminated_length": 1973.5, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "epoch": 0.03618673721942477, "frac_reward_zero_std": 0.671875, "grad_norm": 0.25683891773223877, "kl": 0.012226104736328125, "learning_rate": 7.07482993197279e-06, "loss": -0.0012, "num_tokens": 20903820.0, "reward": 0.740234375, "reward_std": 0.33421841263771057, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.505859375, "rewards/tag_count_reward/std": 0.33711495995521545, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 2862.234375, "completions/mean_terminated_length": 2018.1356201171875, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "epoch": 0.036869505846206366, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2364758849143982, "kl": 0.012935638427734375, "learning_rate": 7.210884353741497e-06, "loss": 0.0089, "num_tokens": 21291368.0, "reward": 0.845703125, "reward_std": 0.31212136149406433, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.595703125, "rewards/tag_count_reward/std": 0.3646823465824127, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.21875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 2790.09375, "completions/mean_terminated_length": 2152.732421875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.037552274472987965, "frac_reward_zero_std": 0.734375, "grad_norm": 0.22100533545017242, "kl": 0.014362335205078125, "learning_rate": 7.346938775510205e-06, "loss": 0.0167, "num_tokens": 21668116.0, "reward": 0.974609375, "reward_std": 0.27345144748687744, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.662109375, "rewards/tag_count_reward/std": 0.3639647364616394, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.46875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3522.0, "completions/mean_length": 2996.7734375, "completions/mean_terminated_length": 1984.74462890625, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.038235043099769564, "frac_reward_zero_std": 0.796875, "grad_norm": 0.2464330792427063, "kl": 0.013202667236328125, "learning_rate": 7.482993197278913e-06, "loss": 0.0063, "num_tokens": 22071957.0, "reward": 0.724609375, "reward_std": 0.14639319479465485, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.521484375, "rewards/tag_count_reward/std": 0.3493911325931549, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2734.3046875, "completions/mean_terminated_length": 1801.03271484375, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.03891781172655116, "frac_reward_zero_std": 0.671875, "grad_norm": 0.2550375759601593, "kl": 0.01432037353515625, "learning_rate": 7.61904761904762e-06, "loss": 0.0186, "num_tokens": 22442366.0, "reward": 0.87890625, "reward_std": 0.24306795001029968, "rewards/accuracy_reward/mean": 0.1349206417798996, "rewards/accuracy_reward/std": 0.343002587556839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.3588852286338806, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 3006.5234375, "completions/mean_terminated_length": 2105.659912109375, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "epoch": 0.03960058035333276, "frac_reward_zero_std": 0.78125, "grad_norm": 0.1897994875907898, "kl": 0.013675689697265625, "learning_rate": 7.755102040816327e-06, "loss": 0.0081, "num_tokens": 22845781.0, "reward": 0.73828125, "reward_std": 0.1767767071723938, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.55078125, "rewards/tag_count_reward/std": 0.3636529743671417, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 2922.1640625, "completions/mean_terminated_length": 1889.699951171875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "epoch": 0.04028334898011436, "frac_reward_zero_std": 0.671875, "grad_norm": 0.9429160356521606, "kl": 0.01638031005859375, "learning_rate": 7.891156462585034e-06, "loss": 0.027, "num_tokens": 23240186.0, "reward": 0.76953125, "reward_std": 0.32593202590942383, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.3605952262878418, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 2997.8984375, "completions/mean_terminated_length": 2168.509521484375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.04096611760689596, "frac_reward_zero_std": 0.796875, "grad_norm": 0.18386708199977875, "kl": 0.015960693359375, "learning_rate": 8.027210884353741e-06, "loss": 0.0108, "num_tokens": 23644729.0, "reward": 0.826171875, "reward_std": 0.20716018974781036, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.576171875, "rewards/tag_count_reward/std": 0.362567663192749, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 3022.7734375, "completions/mean_terminated_length": 2175.431396484375, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "epoch": 0.04164888623367756, "frac_reward_zero_std": 0.796875, "grad_norm": 0.6832523941993713, "kl": 0.017551422119140625, "learning_rate": 8.16326530612245e-06, "loss": 0.0045, "num_tokens": 24051714.0, "reward": 0.751953125, "reward_std": 0.14639319479465485, "rewards/accuracy_reward/mean": 0.1031746044754982, "rewards/accuracy_reward/std": 0.30540111660957336, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.548828125, "rewards/tag_count_reward/std": 0.3632456958293915, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3490.0, "completions/mean_length": 2689.578125, "completions/mean_terminated_length": 1900.38232421875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "epoch": 0.04233165486045916, "frac_reward_zero_std": 0.71875, "grad_norm": 129.58432006835938, "kl": 0.0571136474609375, "learning_rate": 8.299319727891157e-06, "loss": 0.0098, "num_tokens": 24416086.0, "reward": 1.15625, "reward_std": 0.27068930864334106, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.671875, "rewards/tag_count_reward/std": 0.36553001403808594, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 2901.1875, "completions/mean_terminated_length": 1903.2308349609375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.043014423487240765, "frac_reward_zero_std": 0.78125, "grad_norm": 0.5222603678703308, "kl": 0.02112579345703125, "learning_rate": 8.435374149659866e-06, "loss": 0.0177, "num_tokens": 24809638.0, "reward": 0.806640625, "reward_std": 0.20716017484664917, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.556640625, "rewards/tag_count_reward/std": 0.3621007800102234, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3515.0, "completions/mean_length": 2822.15625, "completions/mean_terminated_length": 1778.148193359375, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.043697192114022364, "frac_reward_zero_std": 0.78125, "grad_norm": 0.21305792033672333, "kl": 0.02138519287109375, "learning_rate": 8.571428571428571e-06, "loss": 0.0102, "num_tokens": 25191138.0, "reward": 0.78515625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.36869287490844727, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2753.3046875, "completions/mean_terminated_length": 1896.2381591796875, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.04437996074080396, "frac_reward_zero_std": 0.828125, "grad_norm": 0.18671609461307526, "kl": 0.02201080322265625, "learning_rate": 8.70748299319728e-06, "loss": 0.0067, "num_tokens": 25564915.0, "reward": 1.013671875, "reward_std": 0.17401455342769623, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.623046875, "rewards/tag_count_reward/std": 0.3751588761806488, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 2972.21875, "completions/mean_terminated_length": 2048.549072265625, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 0.04506272936758556, "frac_reward_zero_std": 0.734375, "grad_norm": 0.22553564608097076, "kl": 0.02329254150390625, "learning_rate": 8.843537414965987e-06, "loss": 0.0182, "num_tokens": 25967493.0, "reward": 0.923828125, "reward_std": 0.24583008885383606, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.564453125, "rewards/tag_count_reward/std": 0.36349964141845703, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2866.7265625, "completions/mean_terminated_length": 2126.6826171875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.04574549799436716, "frac_reward_zero_std": 0.75, "grad_norm": 0.26017001271247864, "kl": 0.0234527587890625, "learning_rate": 8.979591836734695e-06, "loss": -0.0095, "num_tokens": 26354266.0, "reward": 0.9453125, "reward_std": 0.2541164755821228, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.36712533235549927, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3515.0, "completions/mean_length": 2819.7890625, "completions/mean_terminated_length": 1738.3585205078125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.04642826662114876, "frac_reward_zero_std": 0.75, "grad_norm": 0.21088667213916779, "kl": 0.02532196044921875, "learning_rate": 9.115646258503402e-06, "loss": 0.0135, "num_tokens": 26735703.0, "reward": 0.912109375, "reward_std": 0.20716017484664917, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.37416577339172363, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.583984375, "rewards/tag_count_reward/std": 0.3608243763446808, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.65625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3516.0, "completions/mean_length": 2978.6484375, "completions/mean_terminated_length": 2122.018798828125, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.04711103524793036, "frac_reward_zero_std": 0.6875, "grad_norm": 18157754.0, "kl": 292736.02210235596, "learning_rate": 9.251700680272109e-06, "loss": 11736.3936, "num_tokens": 27137920.0, "reward": 0.78125, "reward_std": 0.27068930864334106, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.578125, "rewards/tag_count_reward/std": 0.3628273606300354, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 2845.5703125, "completions/mean_terminated_length": 2034.508056640625, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 0.04779380387471196, "frac_reward_zero_std": 0.8125, "grad_norm": 0.20582886040210724, "kl": 0.02655792236328125, "learning_rate": 9.387755102040818e-06, "loss": 0.0103, "num_tokens": 27522535.0, "reward": 0.83203125, "reward_std": 0.08286407589912415, "rewards/accuracy_reward/mean": 0.1111111119389534, "rewards/accuracy_reward/std": 0.31552425026893616, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.61328125, "rewards/tag_count_reward/std": 0.3723454177379608, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 2851.765625, "completions/mean_terminated_length": 1968.034423828125, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "epoch": 0.04847657250149356, "frac_reward_zero_std": 0.78125, "grad_norm": 0.2288392335176468, "kl": 0.028717041015625, "learning_rate": 9.523809523809525e-06, "loss": 0.0201, "num_tokens": 27908937.0, "reward": 0.9140625, "reward_std": 0.13258251547813416, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6171875, "rewards/tag_count_reward/std": 0.36712533235549927, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3520.0, "completions/mean_length": 2899.375, "completions/mean_terminated_length": 2193.01611328125, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "epoch": 0.04915934112827516, "frac_reward_zero_std": 0.71875, "grad_norm": 0.2996331453323364, "kl": 0.033416748046875, "learning_rate": 9.659863945578232e-06, "loss": 0.0107, "num_tokens": 28300409.0, "reward": 1.015625, "reward_std": 0.23201939463615417, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.3668738901615143, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3397.0, "completions/mean_length": 2648.78125, "completions/mean_terminated_length": 1683.8731689453125, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.049842109755056756, "frac_reward_zero_std": 0.78125, "grad_norm": 0.22472000122070312, "kl": 0.0318450927734375, "learning_rate": 9.795918367346939e-06, "loss": 0.0045, "num_tokens": 28658793.0, "reward": 0.974609375, "reward_std": 0.2513543665409088, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.630859375, "rewards/tag_count_reward/std": 0.37248480319976807, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3274.0, "completions/mean_length": 2701.484375, "completions/mean_terminated_length": 1636.3792724609375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.050524878381838355, "frac_reward_zero_std": 0.828125, "grad_norm": 0.22298100590705872, "kl": 0.03203582763671875, "learning_rate": 9.931972789115647e-06, "loss": -0.003, "num_tokens": 29025597.0, "reward": 0.8515625, "reward_std": 0.14915534853935242, "rewards/accuracy_reward/mean": 0.1269841343164444, "rewards/accuracy_reward/std": 0.33428436517715454, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6015625, "rewards/tag_count_reward/std": 0.371787428855896, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5, "completions/max_length": 3584.0, "completions/max_terminated_length": 3559.0, "completions/mean_length": 3035.109375, "completions/mean_terminated_length": 2120.291748046875, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 0.051207647008619954, "frac_reward_zero_std": 0.859375, "grad_norm": 0.2316034436225891, "kl": 0.0307769775390625, "learning_rate": 1.0068027210884354e-05, "loss": 0.0024, "num_tokens": 29434529.0, "reward": 0.83203125, "reward_std": 0.08286407589912415, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53515625, "rewards/tag_count_reward/std": 0.36416009068489075, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3476.0, "completions/mean_length": 2981.8828125, "completions/mean_terminated_length": 1832.386474609375, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "epoch": 0.05189041563540155, "frac_reward_zero_std": 0.703125, "grad_norm": 0.22378158569335938, "kl": 0.03987884521484375, "learning_rate": 1.0204081632653063e-05, "loss": 0.0216, "num_tokens": 29838084.0, "reward": 0.7109375, "reward_std": 0.27068930864334106, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5390625, "rewards/tag_count_reward/std": 0.3555486798286438, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3520.0, "completions/mean_length": 2910.78125, "completions/mean_terminated_length": 1988.2222900390625, "completions/min_length": 627.0, "completions/min_terminated_length": 627.0, "epoch": 0.05257318426218315, "frac_reward_zero_std": 0.828125, "grad_norm": 0.19416174292564392, "kl": 0.03664398193359375, "learning_rate": 1.034013605442177e-05, "loss": 0.0143, "num_tokens": 30230534.0, "reward": 0.79296875, "reward_std": 0.17125241458415985, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.57421875, "rewards/tag_count_reward/std": 0.37035757303237915, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3565.0, "completions/mean_length": 2928.25, "completions/mean_terminated_length": 2057.890869140625, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "epoch": 0.05325595288896475, "frac_reward_zero_std": 0.703125, "grad_norm": 0.26288193464279175, "kl": 0.03601837158203125, "learning_rate": 1.0476190476190477e-05, "loss": 0.0086, "num_tokens": 30624170.0, "reward": 0.865234375, "reward_std": 0.24583008885383606, "rewards/accuracy_reward/mean": 0.1428571492433548, "rewards/accuracy_reward/std": 0.35132402181625366, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.583984375, "rewards/tag_count_reward/std": 0.3675805926322937, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3541.0, "completions/mean_length": 2806.3359375, "completions/mean_terminated_length": 1774.16357421875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.05393872151574635, "frac_reward_zero_std": 0.75, "grad_norm": 0.6135672926902771, "kl": 0.03632354736328125, "learning_rate": 1.0612244897959186e-05, "loss": 0.0285, "num_tokens": 31003449.0, "reward": 0.755859375, "reward_std": 0.21820871531963348, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.583984375, "rewards/tag_count_reward/std": 0.37024855613708496, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.71875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3414.0, "completions/mean_length": 2790.21875, "completions/mean_terminated_length": 1736.654541015625, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.05462149014252795, "frac_reward_zero_std": 0.796875, "grad_norm": 0.20150624215602875, "kl": 0.0357666015625, "learning_rate": 1.0748299319727893e-05, "loss": 0.0141, "num_tokens": 31381519.0, "reward": 0.96484375, "reward_std": 0.19334951043128967, "rewards/accuracy_reward/mean": 0.1984127014875412, "rewards/accuracy_reward/std": 0.4003966450691223, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.57421875, "rewards/tag_count_reward/std": 0.36769041419029236, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3581.0, "completions/mean_length": 2773.3515625, "completions/mean_terminated_length": 1936.9683837890625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.05530425876930955, "frac_reward_zero_std": 0.75, "grad_norm": 0.4934340715408325, "kl": 0.0381622314453125, "learning_rate": 1.0884353741496601e-05, "loss": 0.0225, "num_tokens": 31757594.0, "reward": 1.029296875, "reward_std": 0.28449997305870056, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.638671875, "rewards/tag_count_reward/std": 0.3696250021457672, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3497.0, "completions/mean_length": 2843.8046875, "completions/mean_terminated_length": 1978.152587890625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.05598702739609115, "frac_reward_zero_std": 0.75, "grad_norm": 0.20095695555210114, "kl": 0.0362396240234375, "learning_rate": 1.1020408163265306e-05, "loss": 0.0142, "num_tokens": 32142189.0, "reward": 0.982421875, "reward_std": 0.1905873715877533, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.623046875, "rewards/tag_count_reward/std": 0.3645136058330536, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 2870.203125, "completions/mean_terminated_length": 2086.196533203125, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.05666979602287275, "frac_reward_zero_std": 0.78125, "grad_norm": 0.5752817988395691, "kl": 0.0417022705078125, "learning_rate": 1.1156462585034013e-05, "loss": 0.0081, "num_tokens": 32529979.0, "reward": 0.927734375, "reward_std": 0.21268445253372192, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.615234375, "rewards/tag_count_reward/std": 0.37107837200164795, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3565.0, "completions/mean_length": 2879.8515625, "completions/mean_terminated_length": 2130.274169921875, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.057352564649654346, "frac_reward_zero_std": 0.6875, "grad_norm": 0.27997422218322754, "kl": 0.0484619140625, "learning_rate": 1.1292517006802722e-05, "loss": 0.0007, "num_tokens": 32919492.0, "reward": 0.921875, "reward_std": 0.22097086906433105, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.640625, "rewards/tag_count_reward/std": 0.36418119072914124, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 3058.484375, "completions/mean_terminated_length": 2055.227294921875, "completions/min_length": 526.0, "completions/min_terminated_length": 526.0, "epoch": 0.058035333276435945, "frac_reward_zero_std": 0.8125, "grad_norm": 0.17069101333618164, "kl": 0.0434722900390625, "learning_rate": 1.1428571428571429e-05, "loss": 0.0086, "num_tokens": 33331288.0, "reward": 0.80078125, "reward_std": 0.16020387411117554, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.51953125, "rewards/tag_count_reward/std": 0.35716700553894043, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3436.0, "completions/mean_length": 2770.828125, "completions/mean_terminated_length": 1877.6719970703125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "epoch": 0.058718101903217544, "frac_reward_zero_std": 0.84375, "grad_norm": 0.22186261415481567, "kl": 0.0446319580078125, "learning_rate": 1.1564625850340136e-05, "loss": 0.0107, "num_tokens": 33707626.0, "reward": 0.978515625, "reward_std": 0.14639319479465485, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.619140625, "rewards/tag_count_reward/std": 0.37248480319976807, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.84375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2854.7734375, "completions/mean_terminated_length": 2001.9490966796875, "completions/min_length": 415.0, "completions/min_terminated_length": 415.0, "epoch": 0.05940087052999914, "frac_reward_zero_std": 0.65625, "grad_norm": 0.24146223068237305, "kl": 0.0446929931640625, "learning_rate": 1.1700680272108845e-05, "loss": 0.0156, "num_tokens": 34093783.0, "reward": 0.947265625, "reward_std": 0.33421844244003296, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.37416577339172363, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.619140625, "rewards/tag_count_reward/std": 0.36849987506866455, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.75, "completions/max_length": 3584.0, "completions/max_terminated_length": 3451.0, "completions/mean_length": 2958.7890625, "completions/mean_terminated_length": 2154.946533203125, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 0.06008363915678075, "frac_reward_zero_std": 0.78125, "grad_norm": 0.20638814568519592, "kl": 0.047637939453125, "learning_rate": 1.1836734693877552e-05, "loss": 0.0105, "num_tokens": 34493480.0, "reward": 0.77734375, "reward_std": 0.18782523274421692, "rewards/accuracy_reward/mean": 0.095238097012043, "rewards/accuracy_reward/std": 0.29471534490585327, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58984375, "rewards/tag_count_reward/std": 0.37085554003715515, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3521.0, "completions/mean_length": 2870.28125, "completions/mean_terminated_length": 1792.7059326171875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.06076640778356235, "frac_reward_zero_std": 0.828125, "grad_norm": 0.18048152327537537, "kl": 0.0482940673828125, "learning_rate": 1.197278911564626e-05, "loss": 0.0111, "num_tokens": 34881732.0, "reward": 0.869140625, "reward_std": 0.19611163437366486, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.572265625, "rewards/tag_count_reward/std": 0.3647245168685913, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3527.0, "completions/mean_length": 2773.40625, "completions/mean_terminated_length": 1910.51611328125, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.06144917641034395, "frac_reward_zero_std": 0.78125, "grad_norm": 0.18028239905834198, "kl": 0.0487060546875, "learning_rate": 1.2108843537414967e-05, "loss": 0.0035, "num_tokens": 35258408.0, "reward": 0.8359375, "reward_std": 0.18230095505714417, "rewards/accuracy_reward/mean": 0.1031746044754982, "rewards/accuracy_reward/std": 0.30540111660957336, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6328125, "rewards/tag_count_reward/std": 0.3697965741157532, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.53125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3512.0, "completions/mean_length": 2887.8984375, "completions/mean_terminated_length": 1765.6121826171875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "epoch": 0.06213194503712555, "frac_reward_zero_std": 0.796875, "grad_norm": 0.2542286217212677, "kl": 0.0486907958984375, "learning_rate": 1.2244897959183674e-05, "loss": 0.0046, "num_tokens": 35649313.0, "reward": 0.765625, "reward_std": 0.2154465913772583, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.53125, "rewards/tag_count_reward/std": 0.36044591665267944, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.40625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3513.0, "completions/mean_length": 3025.3125, "completions/mean_terminated_length": 1994.844482421875, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "epoch": 0.06281471366390715, "frac_reward_zero_std": 0.875, "grad_norm": 0.14247578382492065, "kl": 0.0504608154296875, "learning_rate": 1.2380952380952383e-05, "loss": 0.0109, "num_tokens": 36056875.0, "reward": 0.685546875, "reward_std": 0.09115047752857208, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.529296875, "rewards/tag_count_reward/std": 0.35855832695961, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.4375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 2972.015625, "completions/mean_terminated_length": 1881.0870361328125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.06349748229068874, "frac_reward_zero_std": 0.78125, "grad_norm": 22556.3828125, "kl": 95.30116271972656, "learning_rate": 1.251700680272109e-05, "loss": 3.8162, "num_tokens": 36458553.0, "reward": 0.8828125, "reward_std": 0.22097086906433105, "rewards/accuracy_reward/mean": 0.1746031790971756, "rewards/accuracy_reward/std": 0.38114282488822937, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5390625, "rewards/tag_count_reward/std": 0.35967710614204407, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.78125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 2926.9765625, "completions/mean_terminated_length": 2108.578857421875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.06418025091747034, "frac_reward_zero_std": 0.640625, "grad_norm": 0.25790271162986755, "kl": 0.0504608154296875, "learning_rate": 1.2653061224489798e-05, "loss": 0.0192, "num_tokens": 36853586.0, "reward": 0.912109375, "reward_std": 0.3065970540046692, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.599609375, "rewards/tag_count_reward/std": 0.3663232922554016, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3564.0, "completions/mean_length": 2876.5, "completions/mean_terminated_length": 1772.7999267578125, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.06486301954425194, "frac_reward_zero_std": 0.734375, "grad_norm": 0.25338295102119446, "kl": 0.05303955078125, "learning_rate": 1.2789115646258505e-05, "loss": 0.0319, "num_tokens": 37241684.0, "reward": 0.923828125, "reward_std": 0.2403058260679245, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.580078125, "rewards/tag_count_reward/std": 0.3589869737625122, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.5625, "completions/max_length": 3584.0, "completions/max_terminated_length": 3557.0, "completions/mean_length": 3066.59375, "completions/mean_terminated_length": 2259.43994140625, "completions/min_length": 599.0, "completions/min_terminated_length": 599.0, "epoch": 0.06554578817103354, "frac_reward_zero_std": 0.78125, "grad_norm": 0.19196166098117828, "kl": 0.053436279296875, "learning_rate": 1.2925170068027212e-05, "loss": 0.0084, "num_tokens": 37655090.0, "reward": 0.875, "reward_std": 0.22649513185024261, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.578125, "rewards/tag_count_reward/std": 0.35736072063446045, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.21875, "completions/max_length": 3584.0, "completions/max_terminated_length": 3568.0, "completions/mean_length": 3040.234375, "completions/mean_terminated_length": 1799.3333740234375, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "epoch": 0.06622855679781514, "frac_reward_zero_std": 0.796875, "grad_norm": 0.18921677768230438, "kl": 0.057464599609375, "learning_rate": 1.3061224489795918e-05, "loss": 0.0056, "num_tokens": 38065752.0, "reward": 0.76171875, "reward_std": 0.13258251547813416, "rewards/accuracy_reward/mean": 0.1349206417798996, "rewards/accuracy_reward/std": 0.343002587556839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.49609375, "rewards/tag_count_reward/std": 0.34077316522598267, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3539.0, "completions/mean_length": 3062.0078125, "completions/mean_terminated_length": 2273.902099609375, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.06691132542459674, "frac_reward_zero_std": 0.671875, "grad_norm": 0.23682913184165955, "kl": 0.0564117431640625, "learning_rate": 1.3197278911564626e-05, "loss": 0.0142, "num_tokens": 38477655.0, "reward": 0.75390625, "reward_std": 0.24859221279621124, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.56640625, "rewards/tag_count_reward/std": 0.3597412407398224, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.59375, "completions/max_length": 3584.0, "completions/max_terminated_length": 3553.0, "completions/mean_length": 3110.75, "completions/mean_terminated_length": 2396.2353515625, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.06759409405137834, "frac_reward_zero_std": 0.671875, "grad_norm": 0.20407989621162415, "kl": 0.060699462890625, "learning_rate": 1.3333333333333333e-05, "loss": 0.0061, "num_tokens": 38895425.0, "reward": 0.81640625, "reward_std": 0.2651650309562683, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.58203125, "rewards/tag_count_reward/std": 0.3605952262878418, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8125, "completions/max_length": 3584.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 2925.5546875, "completions/mean_terminated_length": 2130.87939453125, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.06827686267815994, "frac_reward_zero_std": 0.734375, "grad_norm": 0.19830115139484406, "kl": 0.058990478515625, "learning_rate": 1.3469387755102042e-05, "loss": 0.0122, "num_tokens": 39289304.0, "reward": 0.955078125, "reward_std": 0.1905873715877533, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.611328125, "rewards/tag_count_reward/std": 0.366952508687973, "step": 100 } ], "logging_steps": 1, "max_steps": 1465, "num_input_tokens_seen": 39289304, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }