{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998666311016271, "eval_steps": 1, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 494.890625, "completions/mean_terminated_length": 482.6614074707031, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.004267804747932782, "grad_norm": 0.2570115373475802, "learning_rate": 0.0, "loss": -0.0035, "num_tokens": 162332.0, "reward": 0.095703125, "reward_std": 0.17584297060966492, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 534.98828125, "completions/mean_terminated_length": 523.0748291015625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.008535609495865563, "grad_norm": 0.24752090505199714, "learning_rate": 4.166666666666666e-08, "loss": -0.006, "num_tokens": 333433.0, "reward": 0.12109375, "reward_std": 0.20258089900016785, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08821486681699753, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1711.0, "completions/mean_length": 474.12109375, "completions/mean_terminated_length": 467.94903564453125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.012803414243798347, "grad_norm": 0.353285694760891, "learning_rate": 8.333333333333333e-08, "loss": -0.0132, "num_tokens": 489184.0, "reward": 0.1796875, "reward_std": 0.308138370513916, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38467901945114136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1710.0, "completions/mean_length": 419.41015625, "completions/mean_terminated_length": 413.0235595703125, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.017071218991731127, "grad_norm": 0.3238140041719121, "learning_rate": 1.25e-07, "loss": -0.0107, "num_tokens": 629873.0, "reward": 0.130859375, "reward_std": 0.21213515102863312, "rewards/accuracy_reward/mean": 0.12890625, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 506.5234375, "completions/mean_terminated_length": 488.2450866699219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.021339023739663912, "grad_norm": 0.3191673881250607, "learning_rate": 1.6666666666666665e-07, "loss": -0.0379, "num_tokens": 795031.0, "reward": 0.150390625, "reward_std": 0.2623838186264038, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 0.01171875, "rewards/format_reward/std": 0.1078278198838234, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 536.17578125, "completions/mean_terminated_length": 512.1785888671875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.025606828487596694, "grad_norm": 0.23250426520311288, "learning_rate": 2.0833333333333333e-07, "loss": -0.0285, "num_tokens": 967388.0, "reward": 0.11328125, "reward_std": 0.19621387124061584, "rewards/accuracy_reward/mean": 0.11328125, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 581.45703125, "completions/mean_terminated_length": 569.909423828125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.029874633235529476, "grad_norm": 0.2570695233430195, "learning_rate": 2.5e-07, "loss": -0.0245, "num_tokens": 1155489.0, "reward": 0.087890625, "reward_std": 0.1791757494211197, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1657.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 448.046875, "completions/mean_terminated_length": 448.046875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.034142437983462254, "grad_norm": 0.3667967373682546, "learning_rate": 2.916666666666667e-07, "loss": -0.0086, "num_tokens": 1306133.0, "reward": 0.125, "reward_std": 0.2576282024383545, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08821486681699753, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 448.9921875, "completions/mean_terminated_length": 436.4015808105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.03841024273139504, "grad_norm": 0.41766001020055105, "learning_rate": 3.333333333333333e-07, "loss": -0.026, "num_tokens": 1456275.0, "reward": 0.171875, "reward_std": 0.27157536149024963, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 480.38671875, "completions/mean_terminated_length": 474.2392578125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.042678047479327824, "grad_norm": 0.32972526301694516, "learning_rate": 3.75e-07, "loss": -0.064, "num_tokens": 1610758.0, "reward": 0.14453125, "reward_std": 0.23395395278930664, "rewards/accuracy_reward/mean": 0.14453125, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 504.75, "completions/mean_terminated_length": 492.5984191894531, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.0469458522272606, "grad_norm": 0.2723514184754716, "learning_rate": 4.1666666666666667e-07, "loss": -0.0389, "num_tokens": 1775486.0, "reward": 0.123046875, "reward_std": 0.2098972499370575, "rewards/accuracy_reward/mean": 0.12109375, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 490.03515625, "completions/mean_terminated_length": 477.7677001953125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.05121365697519339, "grad_norm": 0.9124980782540163, "learning_rate": 4.5833333333333327e-07, "loss": -0.0169, "num_tokens": 1936959.0, "reward": 0.12890625, "reward_std": 0.21988865733146667, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08821486681699753, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 571.9609375, "completions/mean_terminated_length": 571.9609375, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.055481461723126166, "grad_norm": 0.27097928837090157, "learning_rate": 5e-07, "loss": -0.0412, "num_tokens": 2122461.0, "reward": 0.15625, "reward_std": 0.25407516956329346, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 543.5546875, "completions/mean_terminated_length": 531.7086791992188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.05974926647105895, "grad_norm": 0.2412268039467004, "learning_rate": 5.416666666666666e-07, "loss": 0.0021, "num_tokens": 2293243.0, "reward": 0.1328125, "reward_std": 0.20843946933746338, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 499.04296875, "completions/mean_terminated_length": 474.4563903808594, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.06401707121899174, "grad_norm": 0.27402845211677135, "learning_rate": 5.833333333333334e-07, "loss": -0.0334, "num_tokens": 2454966.0, "reward": 0.173828125, "reward_std": 0.24171365797519684, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 517.296875, "completions/mean_terminated_length": 486.8048095703125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.06828487596692451, "grad_norm": 0.27961598068647736, "learning_rate": 6.249999999999999e-07, "loss": -0.0055, "num_tokens": 2620058.0, "reward": 0.16015625, "reward_std": 0.2830517292022705, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.015625, "rewards/format_reward/std": 0.12426253408193588, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 512.2109375, "completions/mean_terminated_length": 506.1882629394531, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.07255268071485729, "grad_norm": 0.2719551051491964, "learning_rate": 6.666666666666666e-07, "loss": -0.0057, "num_tokens": 2796384.0, "reward": 0.12890625, "reward_std": 0.2379668951034546, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08821486681699753, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 531.515625, "completions/mean_terminated_length": 507.4444885253906, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.07682048546279008, "grad_norm": 0.29037656472769297, "learning_rate": 7.083333333333334e-07, "loss": -0.0302, "num_tokens": 2978284.0, "reward": 0.138671875, "reward_std": 0.25321292877197266, "rewards/accuracy_reward/mean": 0.13671875, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 506.19921875, "completions/mean_terminated_length": 494.0590515136719, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "epoch": 0.08108829021072286, "grad_norm": 0.2963880732497132, "learning_rate": 7.5e-07, "loss": -0.0176, "num_tokens": 3147335.0, "reward": 0.171875, "reward_std": 0.2536035180091858, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08821486681699753, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 622.21875, "completions/mean_terminated_length": 582.1365356445312, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.08535609495865565, "grad_norm": 0.23734206503775232, "learning_rate": 7.916666666666666e-07, "loss": -0.0059, "num_tokens": 3345599.0, "reward": 0.15625, "reward_std": 0.23474395275115967, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 569.54296875, "completions/mean_terminated_length": 557.9015502929688, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.08962389970658842, "grad_norm": 0.2636621963543347, "learning_rate": 8.333333333333333e-07, "loss": -0.0261, "num_tokens": 3539034.0, "reward": 0.16796875, "reward_std": 0.2069915533065796, "rewards/accuracy_reward/mean": 0.16796875, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 557.5078125, "completions/mean_terminated_length": 545.7716674804688, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "epoch": 0.0938917044545212, "grad_norm": 0.3118436045433341, "learning_rate": 8.75e-07, "loss": -0.0542, "num_tokens": 3716140.0, "reward": 0.197265625, "reward_std": 0.27905112504959106, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 566.75390625, "completions/mean_terminated_length": 537.2470092773438, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.09815950920245399, "grad_norm": 0.30754531099916416, "learning_rate": 9.166666666666665e-07, "loss": -0.0516, "num_tokens": 3901429.0, "reward": 0.154296875, "reward_std": 0.24952469766139984, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 519.75390625, "completions/mean_terminated_length": 513.7608032226562, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "epoch": 0.10242731395038678, "grad_norm": 0.32388668226405115, "learning_rate": 9.583333333333334e-07, "loss": -0.0207, "num_tokens": 4071686.0, "reward": 0.1796875, "reward_std": 0.30168429017066956, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.38467901945114136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 632.21875, "completions/mean_terminated_length": 586.54833984375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "epoch": 0.10669511869831955, "grad_norm": 0.28009202798891364, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 4272478.0, "reward": 0.173828125, "reward_std": 0.2615548372268677, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 548.80859375, "completions/mean_terminated_length": 531.0316162109375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.11096292344625233, "grad_norm": 0.3052060877640385, "learning_rate": 9.999440509051367e-07, "loss": 0.0023, "num_tokens": 4445901.0, "reward": 0.3359375, "reward_std": 0.34599077701568604, "rewards/accuracy_reward/mean": 0.3359375, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 547.10546875, "completions/mean_terminated_length": 535.2874145507812, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.11523072819418512, "grad_norm": 0.35684106787185593, "learning_rate": 9.997762161417517e-07, "loss": 0.0009, "num_tokens": 4622760.0, "reward": 0.4296875, "reward_std": 0.4141198396682739, "rewards/accuracy_reward/mean": 0.42578125, "rewards/accuracy_reward/std": 0.49542948603630066, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08821486681699753, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 582.15234375, "completions/mean_terminated_length": 570.6102294921875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.1194985329421179, "grad_norm": 0.314364820870329, "learning_rate": 9.994965332706572e-07, "loss": -0.0559, "num_tokens": 4807967.0, "reward": 0.396484375, "reward_std": 0.369541734457016, "rewards/accuracy_reward/mean": 0.39453125, "rewards/accuracy_reward/std": 0.48970720171928406, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 614.94921875, "completions/mean_terminated_length": 592.202392578125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.12376633769005067, "grad_norm": 47.01468065661083, "learning_rate": 9.991050648838675e-07, "loss": -0.049, "num_tokens": 5002658.0, "reward": 0.341796875, "reward_std": 0.33887720108032227, "rewards/accuracy_reward/mean": 0.33203125, "rewards/accuracy_reward/std": 0.4718646705150604, "rewards/format_reward/mean": 0.01953125, "rewards/format_reward/std": 0.13865381479263306, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 749.625, "completions/mean_terminated_length": 729.0159301757812, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "epoch": 0.12803414243798347, "grad_norm": 0.22791664567073833, "learning_rate": 9.986018985905899e-07, "loss": -0.0017, "num_tokens": 5230594.0, "reward": 0.24609375, "reward_std": 0.2706603407859802, "rewards/accuracy_reward/mean": 0.24609375, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 657.0546875, "completions/mean_terminated_length": 640.561279296875, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "epoch": 0.13230194718591626, "grad_norm": 0.2592014737956619, "learning_rate": 9.979871469976195e-07, "loss": -0.0089, "num_tokens": 5429960.0, "reward": 0.390625, "reward_std": 0.35836219787597656, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48884621262550354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 641.6796875, "completions/mean_terminated_length": 625.0039672851562, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.13656975193384902, "grad_norm": 0.29062097226457245, "learning_rate": 9.972609476841365e-07, "loss": -0.0154, "num_tokens": 5633382.0, "reward": 0.43359375, "reward_std": 0.35441678762435913, "rewards/accuracy_reward/mean": 0.43359375, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1923.0, "completions/mean_length": 752.05078125, "completions/mean_terminated_length": 720.9480590820312, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1408375566817818, "grad_norm": 0.2223733295468163, "learning_rate": 9.964234631709185e-07, "loss": -0.0186, "num_tokens": 5865835.0, "reward": 0.33203125, "reward_std": 0.3237749934196472, "rewards/accuracy_reward/mean": 0.33203125, "rewards/accuracy_reward/std": 0.4718646705150604, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 642.63671875, "completions/mean_terminated_length": 631.5708618164062, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.14510536142971459, "grad_norm": 0.28031006546472104, "learning_rate": 9.954748808839674e-07, "loss": 0.0198, "num_tokens": 6069782.0, "reward": 0.44140625, "reward_std": 0.34704914689064026, "rewards/accuracy_reward/mean": 0.44140625, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 678.16796875, "completions/mean_terminated_length": 656.4246215820312, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.14937316617764737, "grad_norm": 0.2753300220077088, "learning_rate": 9.944154131125642e-07, "loss": 0.0057, "num_tokens": 6277537.0, "reward": 0.435546875, "reward_std": 0.31180548667907715, "rewards/accuracy_reward/mean": 0.43359375, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 615.171875, "completions/mean_terminated_length": 592.4285888671875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.15364097092558016, "grad_norm": 0.2519379051726046, "learning_rate": 9.932452969617607e-07, "loss": 0.0259, "num_tokens": 6474301.0, "reward": 0.51953125, "reward_std": 0.3044525980949402, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 697.0859375, "completions/mean_terminated_length": 686.4487915039062, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.15790877567351294, "grad_norm": 0.23690228215779296, "learning_rate": 9.919647942993147e-07, "loss": 0.0077, "num_tokens": 6685995.0, "reward": 0.48046875, "reward_std": 0.2803860306739807, "rewards/accuracy_reward/mean": 0.48046875, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 761.08203125, "completions/mean_terminated_length": 724.903564453125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.16217658042144573, "grad_norm": 0.26189309931903004, "learning_rate": 9.905741916970863e-07, "loss": 0.0554, "num_tokens": 6917440.0, "reward": 0.5703125, "reward_std": 0.24211551249027252, "rewards/accuracy_reward/mean": 0.5703125, "rewards/accuracy_reward/std": 0.4960011839866638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 718.17578125, "completions/mean_terminated_length": 686.260009765625, "completions/min_length": 8.0, "completions/min_terminated_length": 8.0, "epoch": 0.1664443851693785, "grad_norm": 0.22433030609891538, "learning_rate": 9.890738003669027e-07, "loss": 0.0011, "num_tokens": 7140173.0, "reward": 0.421875, "reward_std": 0.3233967423439026, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.49482619762420654, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 707.8203125, "completions/mean_terminated_length": 664.5886840820312, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1707121899173113, "grad_norm": 0.24142502490016315, "learning_rate": 9.874639560909118e-07, "loss": 0.0948, "num_tokens": 7360927.0, "reward": 0.578125, "reward_std": 0.25620076060295105, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49482619762420654, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 667.9375, "completions/mean_terminated_length": 617.65185546875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.17497999466524405, "grad_norm": 0.23034233566289028, "learning_rate": 9.857450191464337e-07, "loss": 0.0705, "num_tokens": 7565055.0, "reward": 0.56640625, "reward_std": 0.29234567284584045, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 808.91015625, "completions/mean_terminated_length": 763.7611694335938, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.17924779941317684, "grad_norm": 0.15288104504565547, "learning_rate": 9.839173742253334e-07, "loss": 0.0067, "num_tokens": 7812320.0, "reward": 0.453125, "reward_std": 0.2123872935771942, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.4987730085849762, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 757.765625, "completions/mean_terminated_length": 710.7530517578125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.18351560416110962, "grad_norm": 0.21817363062261033, "learning_rate": 9.819814303479267e-07, "loss": 0.0301, "num_tokens": 8044124.0, "reward": 0.51953125, "reward_std": 0.2767027020454407, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1877.0, "completions/mean_length": 696.4609375, "completions/mean_terminated_length": 618.272705078125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.1877834089090424, "grad_norm": 0.23513888129908564, "learning_rate": 9.799376207714444e-07, "loss": 0.1136, "num_tokens": 8258978.0, "reward": 0.64453125, "reward_std": 0.2685386538505554, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 823.21484375, "completions/mean_terminated_length": 783.7056274414062, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.1920512136569752, "grad_norm": 0.15675122066313374, "learning_rate": 9.777864028930705e-07, "loss": 0.0199, "num_tokens": 8509073.0, "reward": 0.44921875, "reward_std": 0.247502401471138, "rewards/accuracy_reward/mean": 0.44921875, "rewards/accuracy_reward/std": 0.49838894605636597, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 788.9765625, "completions/mean_terminated_length": 748.3628540039062, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.19631901840490798, "grad_norm": 0.20454769015675417, "learning_rate": 9.755282581475767e-07, "loss": 0.038, "num_tokens": 8753347.0, "reward": 0.59765625, "reward_std": 0.32180553674697876, "rewards/accuracy_reward/mean": 0.59765625, "rewards/accuracy_reward/std": 0.4913311004638672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 688.484375, "completions/mean_terminated_length": 661.4024047851562, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.20058682315284077, "grad_norm": 0.2436406130056707, "learning_rate": 9.73163691899582e-07, "loss": 0.0891, "num_tokens": 8961887.0, "reward": 0.59765625, "reward_std": 0.3268141746520996, "rewards/accuracy_reward/mean": 0.59765625, "rewards/accuracy_reward/std": 0.4913311004638672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 636.9921875, "completions/mean_terminated_length": 636.9921875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.20485462790077355, "grad_norm": 0.19700830452823495, "learning_rate": 9.706932333304517e-07, "loss": 0.0074, "num_tokens": 9161397.0, "reward": 0.5390625, "reward_std": 0.20897233486175537, "rewards/accuracy_reward/mean": 0.5390625, "rewards/accuracy_reward/std": 0.4994482398033142, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 730.828125, "completions/mean_terminated_length": 720.4566650390625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.2091224326487063, "grad_norm": 0.1802054848104146, "learning_rate": 9.681174353198686e-07, "loss": 0.0045, "num_tokens": 9386945.0, "reward": 0.56640625, "reward_std": 0.25209930539131165, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 756.77734375, "completions/mean_terminated_length": 704.28857421875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.2133902373966391, "grad_norm": 0.20316168234356663, "learning_rate": 9.65436874322102e-07, "loss": 0.0501, "num_tokens": 9618008.0, "reward": 0.53125, "reward_std": 0.2852449417114258, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1880.0, "completions/mean_length": 612.92578125, "completions/mean_terminated_length": 595.9091186523438, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 0.21765804214457188, "grad_norm": 0.48416880632938786, "learning_rate": 9.626521502369983e-07, "loss": 0.0519, "num_tokens": 9805733.0, "reward": 0.6953125, "reward_std": 0.28801077604293823, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 789.7734375, "completions/mean_terminated_length": 749.1854858398438, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.22192584689250466, "grad_norm": 0.19757693281422897, "learning_rate": 9.597638862757253e-07, "loss": 0.0277, "num_tokens": 10046019.0, "reward": 0.48828125, "reward_std": 0.18937908113002777, "rewards/accuracy_reward/mean": 0.48828125, "rewards/accuracy_reward/std": 0.5008418560028076, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 758.91015625, "completions/mean_terminated_length": 717.3265991210938, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.22619365164043745, "grad_norm": 0.2070702495543546, "learning_rate": 9.567727288213004e-07, "loss": 0.0079, "num_tokens": 10275228.0, "reward": 0.578125, "reward_std": 0.29420530796051025, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49482619762420654, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 682.43359375, "completions/mean_terminated_length": 638.383056640625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.23046145638837023, "grad_norm": 0.20865981520254198, "learning_rate": 9.536793472839324e-07, "loss": 0.0807, "num_tokens": 10485083.0, "reward": 0.66015625, "reward_std": 0.2412043958902359, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.47458380460739136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 789.47265625, "completions/mean_terminated_length": 764.4024047851562, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.23472926113630302, "grad_norm": 0.24817725107971106, "learning_rate": 9.504844339512094e-07, "loss": 0.0141, "num_tokens": 10725292.0, "reward": 0.4765625, "reward_std": 0.22461043298244476, "rewards/accuracy_reward/mean": 0.4765625, "rewards/accuracy_reward/std": 0.5004287362098694, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 790.01171875, "completions/mean_terminated_length": 738.8739624023438, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "epoch": 0.2389970658842358, "grad_norm": 0.1380585420387336, "learning_rate": 9.471887038331684e-07, "loss": 0.0103, "num_tokens": 10969431.0, "reward": 0.5078125, "reward_std": 0.21884137392044067, "rewards/accuracy_reward/mean": 0.5078125, "rewards/accuracy_reward/std": 0.5009182691574097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 777.04296875, "completions/mean_terminated_length": 746.5400390625, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.2432648706321686, "grad_norm": 0.17843550897472416, "learning_rate": 9.43792894502277e-07, "loss": 0.0051, "num_tokens": 11202554.0, "reward": 0.5, "reward_std": 0.28064805269241333, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5009794235229492, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 738.20703125, "completions/mean_terminated_length": 701.385498046875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.24753267538010135, "grad_norm": 0.19995757252218357, "learning_rate": 9.402977659283689e-07, "loss": 0.0315, "num_tokens": 11427103.0, "reward": 0.5078125, "reward_std": 0.24606332182884216, "rewards/accuracy_reward/mean": 0.5078125, "rewards/accuracy_reward/std": 0.5009182691574097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 704.59375, "completions/mean_terminated_length": 672.35205078125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.25180048012803413, "grad_norm": 0.20212521010131737, "learning_rate": 9.367041003085648e-07, "loss": 0.005, "num_tokens": 11646887.0, "reward": 0.6015625, "reward_std": 0.29538238048553467, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 722.8828125, "completions/mean_terminated_length": 674.5991821289062, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.25606828487596694, "grad_norm": 0.1889634470793339, "learning_rate": 9.330127018922193e-07, "loss": 0.0282, "num_tokens": 11871153.0, "reward": 0.51953125, "reward_std": 0.26025843620300293, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 688.640625, "completions/mean_terminated_length": 661.561767578125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.2603360896238997, "grad_norm": 0.16527044482740116, "learning_rate": 9.29224396800933e-07, "loss": 0.0011, "num_tokens": 12085317.0, "reward": 0.59765625, "reward_std": 0.24302664399147034, "rewards/accuracy_reward/mean": 0.59765625, "rewards/accuracy_reward/std": 0.4913311004638672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 739.89453125, "completions/mean_terminated_length": 703.1204833984375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.2646038943718325, "grad_norm": 0.1817045096138573, "learning_rate": 9.253400328436698e-07, "loss": 0.0054, "num_tokens": 12313746.0, "reward": 0.5625, "reward_std": 0.28380340337753296, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49705013632774353, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 701.35546875, "completions/mean_terminated_length": 679.980224609375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.2688716991197653, "grad_norm": 0.1839180488541761, "learning_rate": 9.213604793270196e-07, "loss": 0.0088, "num_tokens": 12524053.0, "reward": 0.6015625, "reward_std": 0.2809164524078369, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 719.71484375, "completions/mean_terminated_length": 693.2550048828125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.27313950386769803, "grad_norm": 0.1587390985090563, "learning_rate": 9.172866268606513e-07, "loss": 0.0182, "num_tokens": 12743868.0, "reward": 0.578125, "reward_std": 0.17833054065704346, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49482619762420654, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 676.328125, "completions/mean_terminated_length": 649.0040283203125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.27740730861563084, "grad_norm": 0.16100932533778303, "learning_rate": 9.131193871579974e-07, "loss": 0.0195, "num_tokens": 12951496.0, "reward": 0.6640625, "reward_std": 0.2225247025489807, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 701.2421875, "completions/mean_terminated_length": 674.4143676757812, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.2816751133635636, "grad_norm": 0.46716467212075274, "learning_rate": 9.088596928322157e-07, "loss": 0.0134, "num_tokens": 13165550.0, "reward": 0.62109375, "reward_std": 0.21109546720981598, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4860650300979614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 842.6328125, "completions/mean_terminated_length": 793.6340942382812, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.2859429181114964, "grad_norm": 0.16388899852140715, "learning_rate": 9.045084971874737e-07, "loss": 0.0186, "num_tokens": 13415840.0, "reward": 0.55859375, "reward_std": 0.29036736488342285, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 677.08984375, "completions/mean_terminated_length": 660.8340454101562, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.29021072285942917, "grad_norm": 0.2061346542221812, "learning_rate": 9.000667740056032e-07, "loss": 0.0402, "num_tokens": 13622999.0, "reward": 0.671875, "reward_std": 0.30510413646698, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.47045037150382996, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 772.22265625, "completions/mean_terminated_length": 751.9722900390625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.294478527607362, "grad_norm": 0.16099414892174813, "learning_rate": 8.955355173281707e-07, "loss": 0.0091, "num_tokens": 13854584.0, "reward": 0.53125, "reward_std": 0.21226614713668823, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 664.7421875, "completions/mean_terminated_length": 637.187255859375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.29874633235529474, "grad_norm": 0.17624371113200196, "learning_rate": 8.909157412340149e-07, "loss": 0.0376, "num_tokens": 14064302.0, "reward": 0.59375, "reward_std": 0.17833054065704346, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49209436774253845, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 652.93359375, "completions/mean_terminated_length": 647.4627685546875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.30301413710322755, "grad_norm": 0.16045190404205056, "learning_rate": 8.862084796122997e-07, "loss": 0.0079, "num_tokens": 14270813.0, "reward": 0.62109375, "reward_std": 0.2211979478597641, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4860650300979614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 688.3203125, "completions/mean_terminated_length": 650.0963745117188, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3072819418511603, "grad_norm": 0.1887414611146376, "learning_rate": 8.814147859311332e-07, "loss": 0.0354, "num_tokens": 14480903.0, "reward": 0.65625, "reward_std": 0.20672954618930817, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47588926553726196, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 682.43359375, "completions/mean_terminated_length": 644.0441284179688, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.31154974659909307, "grad_norm": 0.1906200755581348, "learning_rate": 8.765357330018055e-07, "loss": 0.0125, "num_tokens": 14692814.0, "reward": 0.59765625, "reward_std": 0.27275487780570984, "rewards/accuracy_reward/mean": 0.59765625, "rewards/accuracy_reward/std": 0.4913311004638672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 807.25390625, "completions/mean_terminated_length": 772.3734741210938, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.3158175513470259, "grad_norm": 0.1883899046813179, "learning_rate": 8.71572412738697e-07, "loss": 0.0295, "num_tokens": 14933559.0, "reward": 0.5703125, "reward_std": 0.28209349513053894, "rewards/accuracy_reward/mean": 0.5703125, "rewards/accuracy_reward/std": 0.4960011839866638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 674.0078125, "completions/mean_terminated_length": 641.0320434570312, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.32008535609495864, "grad_norm": 0.21767892682202633, "learning_rate": 8.66525935914913e-07, "loss": 0.0231, "num_tokens": 15138305.0, "reward": 0.609375, "reward_std": 0.2489503175020218, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48884621262550354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 650.12890625, "completions/mean_terminated_length": 622.2828979492188, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.32435316084289145, "grad_norm": 0.19564087736557972, "learning_rate": 8.613974319136957e-07, "loss": 0.0185, "num_tokens": 15338842.0, "reward": 0.6640625, "reward_std": 0.22423464059829712, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 730.609375, "completions/mean_terminated_length": 677.056884765625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3286209655908242, "grad_norm": 0.17202197261802565, "learning_rate": 8.561880484756724e-07, "loss": 0.0369, "num_tokens": 15566942.0, "reward": 0.5078125, "reward_std": 0.2588193416595459, "rewards/accuracy_reward/mean": 0.5078125, "rewards/accuracy_reward/std": 0.5009182691574097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 748.05078125, "completions/mean_terminated_length": 695.207275390625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.332888770338757, "grad_norm": 0.19789651992275556, "learning_rate": 8.508989514419958e-07, "loss": 0.0341, "num_tokens": 15796491.0, "reward": 0.5546875, "reward_std": 0.29169410467147827, "rewards/accuracy_reward/mean": 0.5546875, "rewards/accuracy_reward/std": 0.49797385931015015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 781.20703125, "completions/mean_terminated_length": 766.185791015625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.3371565750866898, "grad_norm": 0.18106530098095389, "learning_rate": 8.455313244934324e-07, "loss": 0.002, "num_tokens": 16034232.0, "reward": 0.55859375, "reward_std": 0.2968214750289917, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 592.90625, "completions/mean_terminated_length": 569.8095703125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.3414243798346226, "grad_norm": 0.19156344376777484, "learning_rate": 8.400863688854596e-07, "loss": 0.0214, "num_tokens": 16218288.0, "reward": 0.71875, "reward_std": 0.2315973937511444, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 712.8125, "completions/mean_terminated_length": 702.2991943359375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.34569218458255535, "grad_norm": 0.16774458832209882, "learning_rate": 8.34565303179429e-07, "loss": 0.008, "num_tokens": 16433688.0, "reward": 0.734375, "reward_std": 0.21397608518600464, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 732.46484375, "completions/mean_terminated_length": 706.2589721679688, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.3499599893304881, "grad_norm": 0.1747412031038752, "learning_rate": 8.289693629698563e-07, "loss": 0.0117, "num_tokens": 16654767.0, "reward": 0.6328125, "reward_std": 0.25316160917282104, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 788.609375, "completions/mean_terminated_length": 753.2047729492188, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.3542277940784209, "grad_norm": 0.15522696483921386, "learning_rate": 8.232998006078997e-07, "loss": 0.0202, "num_tokens": 16894779.0, "reward": 0.55078125, "reward_std": 0.22396378219127655, "rewards/accuracy_reward/mean": 0.55078125, "rewards/accuracy_reward/std": 0.49838894605636597, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 616.984375, "completions/mean_terminated_length": 616.984375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.3584955988263537, "grad_norm": 0.17841302636273246, "learning_rate": 8.175578849210894e-07, "loss": 0.0247, "num_tokens": 17084591.0, "reward": 0.69140625, "reward_std": 0.22423216700553894, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 627.47265625, "completions/mean_terminated_length": 610.6284790039062, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.3627634035742865, "grad_norm": 0.17841638110639277, "learning_rate": 8.117449009293668e-07, "loss": 0.0174, "num_tokens": 17278168.0, "reward": 0.6328125, "reward_std": 0.2315973937511444, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 642.0859375, "completions/mean_terminated_length": 619.7698974609375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.36703120832221925, "grad_norm": 0.16779939521060505, "learning_rate": 8.058621495575031e-07, "loss": 0.0275, "num_tokens": 17482222.0, "reward": 0.69921875, "reward_std": 0.229889914393425, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 696.1015625, "completions/mean_terminated_length": 680.0711669921875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.37129901307015206, "grad_norm": 0.15547052401529285, "learning_rate": 7.999109473439569e-07, "loss": -0.0119, "num_tokens": 17699168.0, "reward": 0.57421875, "reward_std": 0.22594210505485535, "rewards/accuracy_reward/mean": 0.57421875, "rewards/accuracy_reward/std": 0.49542948603630066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1843.0, "completions/mean_length": 672.66796875, "completions/mean_terminated_length": 634.0039672851562, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3755668178180848, "grad_norm": 0.2075832015926432, "learning_rate": 7.938926261462365e-07, "loss": 0.0422, "num_tokens": 17906067.0, "reward": 0.6640625, "reward_std": 0.2511882185935974, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 706.59375, "completions/mean_terminated_length": 674.4000244140625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.37983462256601763, "grad_norm": 0.17450668008186176, "learning_rate": 7.878085328428368e-07, "loss": 0.0417, "num_tokens": 18123947.0, "reward": 0.70703125, "reward_std": 0.20581842958927155, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 615.2890625, "completions/mean_terminated_length": 615.2890625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3841024273139504, "grad_norm": 0.29433828050165617, "learning_rate": 7.81660029031811e-07, "loss": 0.0251, "num_tokens": 18313293.0, "reward": 0.703125, "reward_std": 0.2618584930896759, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.45777595043182373, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1871.0, "completions/mean_length": 674.09375, "completions/mean_terminated_length": 641.1200561523438, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.38837023206188315, "grad_norm": 0.1524263582318208, "learning_rate": 7.754484907260512e-07, "loss": 0.0346, "num_tokens": 18520901.0, "reward": 0.671875, "reward_std": 0.14203590154647827, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.47045037150382996, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 868.55859375, "completions/mean_terminated_length": 825.5830078125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.39263803680981596, "grad_norm": 0.16545970246243014, "learning_rate": 7.691753080453411e-07, "loss": 0.0313, "num_tokens": 18778724.0, "reward": 0.484375, "reward_std": 0.25209686160087585, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5007347464561462, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 677.42578125, "completions/mean_terminated_length": 672.051025390625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.3969058415577487, "grad_norm": 0.2318783127386051, "learning_rate": 7.628418849052523e-07, "loss": 0.0072, "num_tokens": 18993529.0, "reward": 0.62109375, "reward_std": 0.2673616111278534, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4860650300979614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 761.3125, "completions/mean_terminated_length": 756.2667236328125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.40117364630568153, "grad_norm": 0.17729523898962476, "learning_rate": 7.564496387029531e-07, "loss": 0.0183, "num_tokens": 19224033.0, "reward": 0.5390625, "reward_std": 0.22370177507400513, "rewards/accuracy_reward/mean": 0.5390625, "rewards/accuracy_reward/std": 0.4994482398033142, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 733.1484375, "completions/mean_terminated_length": 706.9561767578125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4054414510536143, "grad_norm": 0.17772837137802816, "learning_rate": 7.5e-07, "loss": 0.006, "num_tokens": 19447919.0, "reward": 0.5703125, "reward_std": 0.22396133840084076, "rewards/accuracy_reward/mean": 0.5703125, "rewards/accuracy_reward/std": 0.4960011839866638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 630.4609375, "completions/mean_terminated_length": 590.6104125976562, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.4097092558015471, "grad_norm": 0.1765422148392705, "learning_rate": 7.434944122021836e-07, "loss": 0.0296, "num_tokens": 19646709.0, "reward": 0.67578125, "reward_std": 0.2021351009607315, "rewards/accuracy_reward/mean": 0.67578125, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 709.61328125, "completions/mean_terminated_length": 699.0748291015625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.41397706054947986, "grad_norm": 1.043131061244302, "learning_rate": 7.369343312364993e-07, "loss": 0.025, "num_tokens": 19864034.0, "reward": 0.66015625, "reward_std": 0.19712254405021667, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.47458380460739136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 584.66796875, "completions/mean_terminated_length": 555.5179443359375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.4182448652974126, "grad_norm": 0.20960134937787364, "learning_rate": 7.303212252253161e-07, "loss": 0.0423, "num_tokens": 20045565.0, "reward": 0.7265625, "reward_std": 0.1910865604877472, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.446596622467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 804.58203125, "completions/mean_terminated_length": 759.2753295898438, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.42251267004534543, "grad_norm": 0.18572317777841613, "learning_rate": 7.236565741578162e-07, "loss": 0.0184, "num_tokens": 20290610.0, "reward": 0.46484375, "reward_std": 0.2952326536178589, "rewards/accuracy_reward/mean": 0.46484375, "rewards/accuracy_reward/std": 0.49973952770233154, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 713.328125, "completions/mean_terminated_length": 713.328125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4267804747932782, "grad_norm": 0.21936154622420784, "learning_rate": 7.16941869558779e-07, "loss": 0.029, "num_tokens": 20509950.0, "reward": 0.62890625, "reward_std": 0.30904948711395264, "rewards/accuracy_reward/mean": 0.62890625, "rewards/accuracy_reward/std": 0.48404383659362793, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 722.703125, "completions/mean_terminated_length": 717.5059204101562, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.431048279541211, "grad_norm": 0.15759949457471728, "learning_rate": 7.101786141547828e-07, "loss": 0.0209, "num_tokens": 20743090.0, "reward": 0.6015625, "reward_std": 0.2474999576807022, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 703.4140625, "completions/mean_terminated_length": 660.040283203125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.43531608428914376, "grad_norm": 0.20631003127294775, "learning_rate": 7.033683215379002e-07, "loss": 0.0208, "num_tokens": 20962836.0, "reward": 0.61328125, "reward_std": 0.24777080118656158, "rewards/accuracy_reward/mean": 0.61328125, "rewards/accuracy_reward/std": 0.4879522919654846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 676.51953125, "completions/mean_terminated_length": 660.2569580078125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.43958388903707657, "grad_norm": 0.18277993766193618, "learning_rate": 6.965125158269618e-07, "loss": 0.0165, "num_tokens": 21181369.0, "reward": 0.6015625, "reward_std": 0.21686306595802307, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 749.9375, "completions/mean_terminated_length": 734.5454711914062, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4438516937850093, "grad_norm": 0.14768169324356623, "learning_rate": 6.896127313264642e-07, "loss": 0.0037, "num_tokens": 21407257.0, "reward": 0.6328125, "reward_std": 0.17267769575119019, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 730.5625, "completions/mean_terminated_length": 714.9407348632812, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.44811949853294214, "grad_norm": 0.1714711197886987, "learning_rate": 6.826705121831976e-07, "loss": 0.0171, "num_tokens": 21628641.0, "reward": 0.6953125, "reward_std": 0.191619411110878, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 670.27734375, "completions/mean_terminated_length": 648.4087524414062, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.4523873032808749, "grad_norm": 0.17919870616850794, "learning_rate": 6.756874120406714e-07, "loss": 0.0125, "num_tokens": 21841048.0, "reward": 0.56640625, "reward_std": 0.16978827118873596, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 728.2578125, "completions/mean_terminated_length": 707.3095703125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.45665510802880765, "grad_norm": 0.1451932728689885, "learning_rate": 6.68664993691415e-07, "loss": 0.0032, "num_tokens": 22064930.0, "reward": 0.65234375, "reward_std": 0.17293481528759003, "rewards/accuracy_reward/mean": 0.65234375, "rewards/accuracy_reward/std": 0.4771590530872345, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 761.76953125, "completions/mean_terminated_length": 720.2781982421875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.46092291277674047, "grad_norm": 0.15406274620167626, "learning_rate": 6.6160482872723e-07, "loss": 0.0406, "num_tokens": 22291983.0, "reward": 0.5703125, "reward_std": 0.2566937804222107, "rewards/accuracy_reward/mean": 0.5703125, "rewards/accuracy_reward/std": 0.4960011839866638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 666.41015625, "completions/mean_terminated_length": 633.2520141601562, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4651907175246732, "grad_norm": 0.18595861629610597, "learning_rate": 6.545084971874736e-07, "loss": 0.0125, "num_tokens": 22504056.0, "reward": 0.5390625, "reward_std": 0.2210792601108551, "rewards/accuracy_reward/mean": 0.5390625, "rewards/accuracy_reward/std": 0.4994482398033142, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 599.96484375, "completions/mean_terminated_length": 588.56298828125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.46945852227260604, "grad_norm": 0.16488321774504297, "learning_rate": 6.473775872054521e-07, "loss": 0.0011, "num_tokens": 22688183.0, "reward": 0.7265625, "reward_std": 0.20582087337970734, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.446596622467041, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 661.24609375, "completions/mean_terminated_length": 644.8023681640625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.4737263270205388, "grad_norm": 0.19850765821238375, "learning_rate": 6.402136946530014e-07, "loss": 0.018, "num_tokens": 22888486.0, "reward": 0.60546875, "reward_std": 0.2784126102924347, "rewards/accuracy_reward/mean": 0.60546875, "rewards/accuracy_reward/std": 0.48970720171928406, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 667.91015625, "completions/mean_terminated_length": 629.1124267578125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.4779941317684716, "grad_norm": 0.20616587813684115, "learning_rate": 6.330184227833375e-07, "loss": 0.0412, "num_tokens": 23098319.0, "reward": 0.6953125, "reward_std": 0.29314449429512024, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 831.84765625, "completions/mean_terminated_length": 802.6600341796875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.48226193651640437, "grad_norm": 0.17727191227994013, "learning_rate": 6.257933818722542e-07, "loss": 0.0011, "num_tokens": 23354552.0, "reward": 0.51953125, "reward_std": 0.2792089581489563, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 626.0, "completions/mean_terminated_length": 591.8720092773438, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.4865297412643372, "grad_norm": 0.2117873494491709, "learning_rate": 6.185401888577487e-07, "loss": 0.0277, "num_tokens": 23548552.0, "reward": 0.6171875, "reward_std": 0.2188364714384079, "rewards/accuracy_reward/mean": 0.6171875, "rewards/accuracy_reward/std": 0.48702529072761536, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 807.8828125, "completions/mean_terminated_length": 762.6963500976562, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.49079754601226994, "grad_norm": 0.1378906526266812, "learning_rate": 6.112604669781572e-07, "loss": 0.0253, "num_tokens": 23800194.0, "reward": 0.6796875, "reward_std": 0.2197500467300415, "rewards/accuracy_reward/mean": 0.6796875, "rewards/accuracy_reward/std": 0.4675106406211853, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1927.0, "completions/mean_length": 755.46875, "completions/mean_terminated_length": 740.142333984375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.4950653507602027, "grad_norm": 0.17213466615909223, "learning_rate": 6.039558454088795e-07, "loss": 0.0048, "num_tokens": 24030330.0, "reward": 0.63671875, "reward_std": 0.22305512428283691, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 656.859375, "completions/mean_terminated_length": 623.4720458984375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.4993331555081355, "grad_norm": 0.1892514134210408, "learning_rate": 5.966279588977766e-07, "loss": 0.0249, "num_tokens": 24234654.0, "reward": 0.640625, "reward_std": 0.2301558405160904, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 691.0859375, "completions/mean_terminated_length": 680.4015502929688, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5036009602560683, "grad_norm": 0.17532242354264443, "learning_rate": 5.892784473993183e-07, "loss": 0.0122, "num_tokens": 24449412.0, "reward": 0.58984375, "reward_std": 0.21436314284801483, "rewards/accuracy_reward/mean": 0.58984375, "rewards/accuracy_reward/std": 0.49282538890838623, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 727.23828125, "completions/mean_terminated_length": 716.8385620117188, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.507868765004001, "grad_norm": 0.15731128075810266, "learning_rate": 5.819089557075688e-07, "loss": 0.0053, "num_tokens": 24674217.0, "reward": 0.5859375, "reward_std": 0.23079612851142883, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1992.0, "completions/mean_length": 675.8671875, "completions/mean_terminated_length": 654.0873413085938, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5121365697519339, "grad_norm": 0.16762770464548887, "learning_rate": 5.745211330880872e-07, "loss": 0.0053, "num_tokens": 24883719.0, "reward": 0.6015625, "reward_std": 0.19438527524471283, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 688.73828125, "completions/mean_terminated_length": 678.035400390625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.5164043744998666, "grad_norm": 0.14032321277110027, "learning_rate": 5.671166329088277e-07, "loss": 0.0173, "num_tokens": 25092724.0, "reward": 0.6015625, "reward_std": 0.17833054065704346, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1918.0, "completions/mean_length": 623.93359375, "completions/mean_terminated_length": 607.0474853515625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.5206721792477994, "grad_norm": 0.12405235765467854, "learning_rate": 5.596971122701221e-07, "loss": 0.0082, "num_tokens": 25288411.0, "reward": 0.76171875, "reward_std": 0.1633341908454895, "rewards/accuracy_reward/mean": 0.76171875, "rewards/accuracy_reward/std": 0.4268665909767151, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 650.7734375, "completions/mean_terminated_length": 628.5952758789062, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.5249399839957322, "grad_norm": 0.15044450960162567, "learning_rate": 5.522642316338268e-07, "loss": 0.0022, "num_tokens": 25488977.0, "reward": 0.640625, "reward_std": 0.15190494060516357, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2048.0, "completions/mean_length": 706.6328125, "completions/mean_terminated_length": 685.34130859375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.529207788743665, "grad_norm": 0.21210650194517838, "learning_rate": 5.448196544517167e-07, "loss": 0.0254, "num_tokens": 25707379.0, "reward": 0.578125, "reward_std": 0.2531664967536926, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49482619762420654, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 797.671875, "completions/mean_terminated_length": 746.8455200195312, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5334755934915978, "grad_norm": 0.1882804309255771, "learning_rate": 5.373650467932121e-07, "loss": 0.0148, "num_tokens": 25947999.0, "reward": 0.54296875, "reward_std": 0.26158764958381653, "rewards/accuracy_reward/mean": 0.54296875, "rewards/accuracy_reward/std": 0.4991260766983032, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 691.6875, "completions/mean_terminated_length": 659.1360473632812, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.5377433982395305, "grad_norm": 0.1922973128812697, "learning_rate": 5.299020769725171e-07, "loss": -0.0067, "num_tokens": 26169399.0, "reward": 0.59765625, "reward_std": 0.25118574500083923, "rewards/accuracy_reward/mean": 0.59765625, "rewards/accuracy_reward/std": 0.4913311004638672, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 620.89453125, "completions/mean_terminated_length": 609.657470703125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5420112029874633, "grad_norm": 0.11202992839540406, "learning_rate": 5.224324151752575e-07, "loss": -0.0027, "num_tokens": 26363860.0, "reward": 0.6953125, "reward_std": 0.14716076850891113, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1596.0, "completions/mean_length": 791.625, "completions/mean_terminated_length": 766.59765625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5462790077353961, "grad_norm": 0.15263335447293108, "learning_rate": 5.149577330846992e-07, "loss": 0.0263, "num_tokens": 26603028.0, "reward": 0.578125, "reward_std": 0.17662060260772705, "rewards/accuracy_reward/mean": 0.5967742204666138, "rewards/accuracy_reward/std": 0.4915373921394348, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 823.96875, "completions/mean_terminated_length": 794.592041015625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.5505468124833289, "grad_norm": 0.15455545451406247, "learning_rate": 5.074797035076318e-07, "loss": 0.0167, "num_tokens": 26849420.0, "reward": 0.55859375, "reward_std": 0.2684224247932434, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 814.3046875, "completions/mean_terminated_length": 784.696044921875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.5548146172312617, "grad_norm": 0.1451854352321105, "learning_rate": 5e-07, "loss": -0.0009, "num_tokens": 27101234.0, "reward": 0.46484375, "reward_std": 0.17768390476703644, "rewards/accuracy_reward/mean": 0.46484375, "rewards/accuracy_reward/std": 0.49973952770233154, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 652.73046875, "completions/mean_terminated_length": 624.936279296875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5590824219791944, "grad_norm": 0.16448043370772392, "learning_rate": 4.925202964923683e-07, "loss": 0.0494, "num_tokens": 27307317.0, "reward": 0.734375, "reward_std": 0.17688506841659546, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 799.171875, "completions/mean_terminated_length": 753.6680297851562, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.5633502267271272, "grad_norm": 0.15215520006680078, "learning_rate": 4.850422669153009e-07, "loss": 0.0332, "num_tokens": 27550865.0, "reward": 0.6015625, "reward_std": 0.25289812684059143, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 671.6640625, "completions/mean_terminated_length": 638.6320190429688, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.5676180314750601, "grad_norm": 0.17982488060357216, "learning_rate": 4.775675848247427e-07, "loss": 0.0252, "num_tokens": 27755795.0, "reward": 0.69921875, "reward_std": 0.24670997262001038, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 727.27734375, "completions/mean_terminated_length": 695.5800170898438, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5718858362229928, "grad_norm": 0.18005160823929492, "learning_rate": 4.700979230274829e-07, "loss": 0.0189, "num_tokens": 27983106.0, "reward": 0.55078125, "reward_std": 0.20384502410888672, "rewards/accuracy_reward/mean": 0.55078125, "rewards/accuracy_reward/std": 0.49838894605636597, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 689.609375, "completions/mean_terminated_length": 662.5498046875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5761536409709256, "grad_norm": 0.14230587148619128, "learning_rate": 4.626349532067879e-07, "loss": 0.0063, "num_tokens": 28208398.0, "reward": 0.5859375, "reward_std": 0.19503436982631683, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1827.0, "completions/mean_length": 699.90234375, "completions/mean_terminated_length": 678.5040283203125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5804214457188583, "grad_norm": 0.20157881415963988, "learning_rate": 4.5518034554828327e-07, "loss": 0.0413, "num_tokens": 28418677.0, "reward": 0.60546875, "reward_std": 0.26501142978668213, "rewards/accuracy_reward/mean": 0.60546875, "rewards/accuracy_reward/std": 0.48970720171928406, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 723.7890625, "completions/mean_terminated_length": 702.7698974609375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5846892504667911, "grad_norm": 0.16773141819498977, "learning_rate": 4.477357683661733e-07, "loss": 0.0307, "num_tokens": 28645927.0, "reward": 0.53515625, "reward_std": 0.21238481998443604, "rewards/accuracy_reward/mean": 0.53515625, "rewards/accuracy_reward/std": 0.49973952770233154, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1852.0, "completions/mean_length": 727.3828125, "completions/mean_terminated_length": 701.0757446289062, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.588957055214724, "grad_norm": 0.17450962048980123, "learning_rate": 4.403028877298779e-07, "loss": 0.041, "num_tokens": 28871649.0, "reward": 0.51953125, "reward_std": 0.23330485820770264, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 762.8828125, "completions/mean_terminated_length": 716.0567016601562, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.5932248599626567, "grad_norm": 0.1457325565330749, "learning_rate": 4.328833670911724e-07, "loss": 0.0343, "num_tokens": 29103011.0, "reward": 0.515625, "reward_std": 0.19503435492515564, "rewards/accuracy_reward/mean": 0.5322580933570862, "rewards/accuracy_reward/std": 0.4999673366546631, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 738.21875, "completions/mean_terminated_length": 695.9677124023438, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5974926647105895, "grad_norm": 0.16245529520864127, "learning_rate": 4.254788669119127e-07, "loss": 0.0494, "num_tokens": 29329851.0, "reward": 0.6171875, "reward_std": 0.25658145546913147, "rewards/accuracy_reward/mean": 0.6171875, "rewards/accuracy_reward/std": 0.48702529072761536, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 723.453125, "completions/mean_terminated_length": 723.453125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.6017604694585222, "grad_norm": 0.15294716618010246, "learning_rate": 4.180910442924311e-07, "loss": 0.0056, "num_tokens": 29552751.0, "reward": 0.61328125, "reward_std": 0.2106798142194748, "rewards/accuracy_reward/mean": 0.61328125, "rewards/accuracy_reward/std": 0.4879522919654846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 717.20703125, "completions/mean_terminated_length": 690.6972045898438, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6060282742064551, "grad_norm": 0.16792896574939836, "learning_rate": 4.107215526006817e-07, "loss": 0.021, "num_tokens": 29769820.0, "reward": 0.5078125, "reward_std": 0.2486819177865982, "rewards/accuracy_reward/mean": 0.5078125, "rewards/accuracy_reward/std": 0.5009182691574097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 682.1875, "completions/mean_terminated_length": 671.4330444335938, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6102960789543879, "grad_norm": 0.150865313084476, "learning_rate": 4.0337204110222347e-07, "loss": 0.0116, "num_tokens": 29980868.0, "reward": 0.6953125, "reward_std": 0.18884865939617157, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 685.66015625, "completions/mean_terminated_length": 664.0357666015625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6145638837023206, "grad_norm": 0.1953752730366482, "learning_rate": 3.960441545911204e-07, "loss": 0.0241, "num_tokens": 30194781.0, "reward": 0.609375, "reward_std": 0.226848304271698, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48884621262550354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 713.00390625, "completions/mean_terminated_length": 691.8135375976562, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.6188316884502534, "grad_norm": 0.15612651886441767, "learning_rate": 3.8873953302184283e-07, "loss": 0.0161, "num_tokens": 30419150.0, "reward": 0.50390625, "reward_std": 0.1810988485813141, "rewards/accuracy_reward/mean": 0.50390625, "rewards/accuracy_reward/std": 0.5009641647338867, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 696.49609375, "completions/mean_terminated_length": 685.8543090820312, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6230994931981861, "grad_norm": 0.14637261498886597, "learning_rate": 3.814598111422513e-07, "loss": 0.024, "num_tokens": 30635453.0, "reward": 0.65625, "reward_std": 0.20357908308506012, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47588926553726196, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1894.0, "completions/mean_length": 759.41796875, "completions/mean_terminated_length": 701.563232421875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.627367297946119, "grad_norm": 0.20463532100644813, "learning_rate": 3.742066181277457e-07, "loss": 0.0394, "num_tokens": 30872104.0, "reward": 0.51953125, "reward_std": 0.28117847442626953, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 789.453125, "completions/mean_terminated_length": 754.072265625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6316351026940518, "grad_norm": 0.15130445176879567, "learning_rate": 3.669815772166625e-07, "loss": 0.0299, "num_tokens": 31115084.0, "reward": 0.48828125, "reward_std": 0.2386942058801651, "rewards/accuracy_reward/mean": 0.48828125, "rewards/accuracy_reward/std": 0.5008418560028076, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 695.59375, "completions/mean_terminated_length": 684.9448852539062, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.6359029074419845, "grad_norm": 0.17477379892719042, "learning_rate": 3.5978630534699865e-07, "loss": 0.0265, "num_tokens": 31332292.0, "reward": 0.5625, "reward_std": 0.25658145546913147, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49705013632774353, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 712.81640625, "completions/mean_terminated_length": 686.2191162109375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6401707121899173, "grad_norm": 0.18543376646343887, "learning_rate": 3.526224127945478e-07, "loss": 0.0215, "num_tokens": 31550877.0, "reward": 0.640625, "reward_std": 0.25487154722213745, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 604.0859375, "completions/mean_terminated_length": 592.716552734375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.6444385169378501, "grad_norm": 0.18278541964037912, "learning_rate": 3.454915028125263e-07, "loss": 0.0299, "num_tokens": 31745163.0, "reward": 0.73046875, "reward_std": 0.2293570339679718, "rewards/accuracy_reward/mean": 0.73046875, "rewards/accuracy_reward/std": 0.44458550214767456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 649.3671875, "completions/mean_terminated_length": 649.3671875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.6487063216857829, "grad_norm": 2.653852717153093, "learning_rate": 3.3839517127277004e-07, "loss": 0.0311, "num_tokens": 31947089.0, "reward": 0.6171875, "reward_std": 0.2026655077934265, "rewards/accuracy_reward/mean": 0.6171875, "rewards/accuracy_reward/std": 0.48702529072761536, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 692.83203125, "completions/mean_terminated_length": 631.9877319335938, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6529741264337157, "grad_norm": 0.14274698223421514, "learning_rate": 3.31335006308585e-07, "loss": 0.0308, "num_tokens": 32173766.0, "reward": 0.57421875, "reward_std": 0.14058801531791687, "rewards/accuracy_reward/mean": 0.57421875, "rewards/accuracy_reward/std": 0.49542948603630066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 711.15625, "completions/mean_terminated_length": 651.1346435546875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6572419311816484, "grad_norm": 0.13873122233103782, "learning_rate": 3.243125879593286e-07, "loss": 0.0217, "num_tokens": 32395510.0, "reward": 0.64453125, "reward_std": 0.14992907643318176, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 670.67578125, "completions/mean_terminated_length": 659.8306884765625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6615097359295812, "grad_norm": 0.16194091457756485, "learning_rate": 3.173294878168025e-07, "loss": 0.02, "num_tokens": 32603435.0, "reward": 0.6875, "reward_std": 0.22738119959831238, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4644203782081604, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 668.96875, "completions/mean_terminated_length": 641.498046875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.665777540677514, "grad_norm": 0.20775701511344793, "learning_rate": 3.1038726867353583e-07, "loss": 0.0147, "num_tokens": 32815211.0, "reward": 0.609375, "reward_std": 0.24382543563842773, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48884621262550354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 738.48828125, "completions/mean_terminated_length": 733.3529663085938, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6700453454254468, "grad_norm": 0.16412376814421997, "learning_rate": 3.034874841730382e-07, "loss": 0.0326, "num_tokens": 33039328.0, "reward": 0.61328125, "reward_std": 0.19727861881256104, "rewards/accuracy_reward/mean": 0.61328125, "rewards/accuracy_reward/std": 0.4879522919654846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 663.28515625, "completions/mean_terminated_length": 618.616943359375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.6743131501733796, "grad_norm": 0.13990368182253585, "learning_rate": 2.9663167846209996e-07, "loss": 0.0185, "num_tokens": 33246481.0, "reward": 0.62109375, "reward_std": 0.16807834804058075, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4860650300979614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 720.7421875, "completions/mean_terminated_length": 705.0039672851562, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6785809549213123, "grad_norm": 0.21349501848091973, "learning_rate": 2.898213858452173e-07, "loss": 0.0235, "num_tokens": 33476039.0, "reward": 0.56640625, "reward_std": 0.30327552556991577, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 698.4375, "completions/mean_terminated_length": 666.0480346679688, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6828487596692452, "grad_norm": 2.7101145922860264, "learning_rate": 2.8305813044122093e-07, "loss": 0.0384, "num_tokens": 33691495.0, "reward": 0.640625, "reward_std": 0.18623006343841553, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 695.64453125, "completions/mean_terminated_length": 684.9960327148438, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6871165644171779, "grad_norm": 0.19246224673247156, "learning_rate": 2.763434258421836e-07, "loss": 0.0258, "num_tokens": 33903004.0, "reward": 0.63671875, "reward_std": 0.2394905686378479, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 787.1015625, "completions/mean_terminated_length": 772.1502075195312, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.6913843691651107, "grad_norm": 0.17498122004000508, "learning_rate": 2.696787747746839e-07, "loss": 0.0354, "num_tokens": 34144806.0, "reward": 0.5078125, "reward_std": 0.2318657785654068, "rewards/accuracy_reward/mean": 0.5078125, "rewards/accuracy_reward/std": 0.5009182691574097, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 691.89453125, "completions/mean_terminated_length": 642.4818115234375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6956521739130435, "grad_norm": 0.15967496784346374, "learning_rate": 2.6306566876350067e-07, "loss": 0.0105, "num_tokens": 34367667.0, "reward": 0.64453125, "reward_std": 0.1528160572052002, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 709.24609375, "completions/mean_terminated_length": 682.5776977539062, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.6999199786609762, "grad_norm": 0.16783695896488116, "learning_rate": 2.5650558779781635e-07, "loss": 0.0212, "num_tokens": 34586250.0, "reward": 0.5546875, "reward_std": 0.23448437452316284, "rewards/accuracy_reward/mean": 0.5546875, "rewards/accuracy_reward/std": 0.49797385931015015, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 641.84765625, "completions/mean_terminated_length": 630.7755737304688, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.7041877834089091, "grad_norm": 0.13916296254919422, "learning_rate": 2.500000000000001e-07, "loss": 0.0377, "num_tokens": 34785107.0, "reward": 0.640625, "reward_std": 0.18477579951286316, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 698.90625, "completions/mean_terminated_length": 682.9091186523438, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7084555881568418, "grad_norm": 0.16427060939231672, "learning_rate": 2.4355036129704696e-07, "loss": 0.022, "num_tokens": 35000283.0, "reward": 0.55859375, "reward_std": 0.26143547892570496, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 690.453125, "completions/mean_terminated_length": 690.453125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.7127233929047746, "grad_norm": 0.16192550263946426, "learning_rate": 2.371581150947476e-07, "loss": 0.003, "num_tokens": 35210223.0, "reward": 0.5859375, "reward_std": 0.18675412237644196, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 723.69140625, "completions/mean_terminated_length": 697.310791015625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7169911976527074, "grad_norm": 0.16721621945604426, "learning_rate": 2.3082469195465893e-07, "loss": -0.0017, "num_tokens": 35434304.0, "reward": 0.60546875, "reward_std": 0.22396379709243774, "rewards/accuracy_reward/mean": 0.60546875, "rewards/accuracy_reward/std": 0.48970720171928406, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 615.58203125, "completions/mean_terminated_length": 604.3031616210938, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7212590024006402, "grad_norm": 0.18161777198423945, "learning_rate": 2.2455150927394878e-07, "loss": 0.0457, "num_tokens": 35625213.0, "reward": 0.6640625, "reward_std": 0.23395150899887085, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 633.375, "completions/mean_terminated_length": 622.2362060546875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.725526807148573, "grad_norm": 0.15728519931429938, "learning_rate": 2.1833997096818895e-07, "loss": 0.0157, "num_tokens": 35830397.0, "reward": 0.703125, "reward_std": 0.21778053045272827, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.45777595043182373, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 710.61328125, "completions/mean_terminated_length": 667.4717407226562, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7297946118965057, "grad_norm": 0.19673166776660161, "learning_rate": 2.121914671571633e-07, "loss": 0.0184, "num_tokens": 36053362.0, "reward": 0.5859375, "reward_std": 0.2480328232049942, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1657.0, "completions/mean_length": 645.01953125, "completions/mean_terminated_length": 628.3834228515625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.7340624166444385, "grad_norm": 0.18662654855006655, "learning_rate": 2.0610737385376348e-07, "loss": 0.0149, "num_tokens": 36257215.0, "reward": 0.625, "reward_std": 0.2225247174501419, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4850712716579437, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 729.95703125, "completions/mean_terminated_length": 724.7882690429688, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7383302213923713, "grad_norm": 0.15914760867061006, "learning_rate": 2.0008905265604315e-07, "loss": 0.016, "num_tokens": 36487676.0, "reward": 0.61328125, "reward_std": 0.21397364139556885, "rewards/accuracy_reward/mean": 0.61328125, "rewards/accuracy_reward/std": 0.4879522919654846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 512.953125, "completions/mean_terminated_length": 506.933349609375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7425980261403041, "grad_norm": 0.21401549472539927, "learning_rate": 1.9413785044249676e-07, "loss": 0.0099, "num_tokens": 36652152.0, "reward": 0.828125, "reward_std": 0.22962790727615356, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 823.72265625, "completions/mean_terminated_length": 799.3346557617188, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.7468658308882369, "grad_norm": 0.1935931177350886, "learning_rate": 1.8825509907063326e-07, "loss": 0.0157, "num_tokens": 36909769.0, "reward": 0.546875, "reward_std": 0.26394912600517273, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4987730085849762, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1846.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 588.75390625, "completions/mean_terminated_length": 588.75390625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7511336356361696, "grad_norm": 0.18692109373962446, "learning_rate": 1.824421150789106e-07, "loss": 0.0363, "num_tokens": 37094778.0, "reward": 0.71484375, "reward_std": 0.22278673946857452, "rewards/accuracy_reward/mean": 0.71484375, "rewards/accuracy_reward/std": 0.4523732364177704, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1952.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 611.62109375, "completions/mean_terminated_length": 611.62109375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.7554014403841024, "grad_norm": 0.19122585153150984, "learning_rate": 1.7670019939210023e-07, "loss": 0.0113, "num_tokens": 37296913.0, "reward": 0.63671875, "reward_std": 0.1732081174850464, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 737.125, "completions/mean_terminated_length": 726.8031616210938, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7596692451320353, "grad_norm": 0.20920060817748004, "learning_rate": 1.710306370301437e-07, "loss": 0.0334, "num_tokens": 37521321.0, "reward": 0.61328125, "reward_std": 0.29116860032081604, "rewards/accuracy_reward/mean": 0.61328125, "rewards/accuracy_reward/std": 0.4879522919654846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 826.265625, "completions/mean_terminated_length": 781.7490234375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.763937049879968, "grad_norm": 0.16926216445641215, "learning_rate": 1.6543469682057104e-07, "loss": 0.0227, "num_tokens": 37767125.0, "reward": 0.47265625, "reward_std": 0.2974705398082733, "rewards/accuracy_reward/mean": 0.47265625, "rewards/accuracy_reward/std": 0.5002297759056091, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1650.0, "completions/mean_length": 621.671875, "completions/mean_terminated_length": 610.44091796875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7682048546279008, "grad_norm": 0.25834397337634973, "learning_rate": 1.599136311145402e-07, "loss": 0.0052, "num_tokens": 37958313.0, "reward": 0.63671875, "reward_std": 0.2747243642807007, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 631.9296875, "completions/mean_terminated_length": 609.452392578125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7724726593758335, "grad_norm": 0.21432978221288052, "learning_rate": 1.5446867550656767e-07, "loss": 0.0292, "num_tokens": 38156599.0, "reward": 0.671875, "reward_std": 0.27328526973724365, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.47045037150382996, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 752.87890625, "completions/mean_terminated_length": 716.4698486328125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.7767404641237663, "grad_norm": 0.17437061893863778, "learning_rate": 1.4910104855800426e-07, "loss": 0.0221, "num_tokens": 38383904.0, "reward": 0.6328125, "reward_std": 0.19780902564525604, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 732.80859375, "completions/mean_terminated_length": 717.2134399414062, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7810082688716992, "grad_norm": 0.1598318880580154, "learning_rate": 1.4381195152432769e-07, "loss": 0.0221, "num_tokens": 38611175.0, "reward": 0.6328125, "reward_std": 0.2296190708875656, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 717.453125, "completions/mean_terminated_length": 690.9482421875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7852760736196319, "grad_norm": 0.18863795184539878, "learning_rate": 1.3860256808630427e-07, "loss": 0.0162, "num_tokens": 38834187.0, "reward": 0.5390625, "reward_std": 0.21158601343631744, "rewards/accuracy_reward/mean": 0.5390625, "rewards/accuracy_reward/std": 0.4994482398033142, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 652.3125, "completions/mean_terminated_length": 635.7628784179688, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7895438783675647, "grad_norm": 0.20121677777375388, "learning_rate": 1.3347406408508694e-07, "loss": 0.0113, "num_tokens": 39035419.0, "reward": 0.55859375, "reward_std": 0.26382553577423096, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 618.82421875, "completions/mean_terminated_length": 584.5240478515625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7938116831154974, "grad_norm": 0.16491042479146237, "learning_rate": 1.284275872613028e-07, "loss": 0.0368, "num_tokens": 39231782.0, "reward": 0.6328125, "reward_std": 0.1459837257862091, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 635.5625, "completions/mean_terminated_length": 607.4263305664062, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.7980794878634303, "grad_norm": 0.23455442334789772, "learning_rate": 1.2346426699819456e-07, "loss": 0.0442, "num_tokens": 39433758.0, "reward": 0.68359375, "reward_std": 0.19989721477031708, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 713.671875, "completions/mean_terminated_length": 692.4921264648438, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.8023472926113631, "grad_norm": 0.15756234156819612, "learning_rate": 1.1858521406886674e-07, "loss": -0.0005, "num_tokens": 39666066.0, "reward": 0.50390625, "reward_std": 0.1810988485813141, "rewards/accuracy_reward/mean": 0.50390625, "rewards/accuracy_reward/std": 0.5009641647338867, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 776.61328125, "completions/mean_terminated_length": 719.5305786132812, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8066150973592958, "grad_norm": 0.18804973381454748, "learning_rate": 1.1379152038770029e-07, "loss": 0.0219, "num_tokens": 39908447.0, "reward": 0.4765625, "reward_std": 0.25513601303100586, "rewards/accuracy_reward/mean": 0.4765625, "rewards/accuracy_reward/std": 0.5004287362098694, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1898.0, "completions/mean_length": 791.19921875, "completions/mean_terminated_length": 761.0360107421875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.8108829021072286, "grad_norm": 0.16325751022804733, "learning_rate": 1.090842587659851e-07, "loss": 0.0221, "num_tokens": 40146162.0, "reward": 0.53125, "reward_std": 0.24041050672531128, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 606.69921875, "completions/mean_terminated_length": 601.047119140625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8151507068551613, "grad_norm": 0.1573394710102458, "learning_rate": 1.044644826718295e-07, "loss": 0.0311, "num_tokens": 40331989.0, "reward": 0.734375, "reward_std": 0.17347405850887299, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 645.078125, "completions/mean_terminated_length": 617.1314697265625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.8194185116030942, "grad_norm": 0.4096923043938982, "learning_rate": 9.99332259943969e-08, "loss": 0.0204, "num_tokens": 40531993.0, "reward": 0.6484375, "reward_std": 0.25567278265953064, "rewards/accuracy_reward/mean": 0.6484375, "rewards/accuracy_reward/std": 0.47839346528053284, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 692.7109375, "completions/mean_terminated_length": 665.7131958007812, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.823686316351027, "grad_norm": 0.18040801058119468, "learning_rate": 9.549150281252632e-08, "loss": 0.002, "num_tokens": 40743095.0, "reward": 0.60546875, "reward_std": 0.2296215295791626, "rewards/accuracy_reward/mean": 0.60546875, "rewards/accuracy_reward/std": 0.48970720171928406, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 665.02734375, "completions/mean_terminated_length": 643.075439453125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.8279541210989597, "grad_norm": 0.17019939572706502, "learning_rate": 9.114030716778432e-08, "loss": 0.0224, "num_tokens": 40947638.0, "reward": 0.57421875, "reward_std": 0.1618887335062027, "rewards/accuracy_reward/mean": 0.57421875, "rewards/accuracy_reward/std": 0.49542948603630066, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 726.453125, "completions/mean_terminated_length": 726.453125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8322219258468925, "grad_norm": 0.19470897147732638, "learning_rate": 8.688061284200265e-08, "loss": 0.0125, "num_tokens": 41171050.0, "reward": 0.5703125, "reward_std": 0.2899891138076782, "rewards/accuracy_reward/mean": 0.5703125, "rewards/accuracy_reward/std": 0.4960011839866638, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 715.34375, "completions/mean_terminated_length": 694.1904907226562, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8364897305948252, "grad_norm": 0.16961140906916883, "learning_rate": 8.271337313934867e-08, "loss": 0.0199, "num_tokens": 41394090.0, "reward": 0.6015625, "reward_std": 0.21147862076759338, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 753.76953125, "completions/mean_terminated_length": 717.385498046875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.8407575353427581, "grad_norm": 0.14267834470354487, "learning_rate": 7.863952067298041e-08, "loss": -0.004, "num_tokens": 41621135.0, "reward": 0.68359375, "reward_std": 0.1810988485813141, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 717.44921875, "completions/mean_terminated_length": 696.3294067382812, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8450253400906909, "grad_norm": 0.18522820075393684, "learning_rate": 7.465996715633027e-08, "loss": 0.0201, "num_tokens": 41845914.0, "reward": 0.59375, "reward_std": 0.3056321144104004, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49209436774253845, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 703.875, "completions/mean_terminated_length": 698.6039428710938, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8492931448386236, "grad_norm": 0.17512294585566626, "learning_rate": 7.077560319906694e-08, "loss": 0.0129, "num_tokens": 42060298.0, "reward": 0.60546875, "reward_std": 0.1737360954284668, "rewards/accuracy_reward/mean": 0.60546875, "rewards/accuracy_reward/std": 0.48970720171928406, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 686.66015625, "completions/mean_terminated_length": 648.3895263671875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.8535609495865564, "grad_norm": 0.12460188705163991, "learning_rate": 6.698729810778064e-08, "loss": 0.0108, "num_tokens": 42271123.0, "reward": 0.68359375, "reward_std": 0.135463148355484, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 628.1796875, "completions/mean_terminated_length": 582.3790283203125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8578287543344892, "grad_norm": 1.1717296106182498, "learning_rate": 6.329589969143517e-08, "loss": 0.0238, "num_tokens": 42468385.0, "reward": 0.640625, "reward_std": 0.2137165069580078, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 730.0546875, "completions/mean_terminated_length": 698.4240112304688, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.862096559082422, "grad_norm": 0.15656348367686632, "learning_rate": 5.9702234071631e-08, "loss": 0.0016, "num_tokens": 42693055.0, "reward": 0.56640625, "reward_std": 0.17635467648506165, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 724.73046875, "completions/mean_terminated_length": 692.9720458984375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.8663643638303548, "grad_norm": 0.1459269992065347, "learning_rate": 5.620710549772295e-08, "loss": 0.0421, "num_tokens": 42912986.0, "reward": 0.59375, "reward_std": 0.16860876977443695, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49209436774253845, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 693.640625, "completions/mean_terminated_length": 672.1428833007812, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8706321685782875, "grad_norm": 0.1809249668666066, "learning_rate": 5.2811296166831666e-08, "loss": 0.0218, "num_tokens": 43125238.0, "reward": 0.61328125, "reward_std": 0.24606087803840637, "rewards/accuracy_reward/mean": 0.61328125, "rewards/accuracy_reward/std": 0.4879522919654846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 711.7578125, "completions/mean_terminated_length": 695.9130859375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.8748999733262203, "grad_norm": 0.20158349817144566, "learning_rate": 4.951556604879048e-08, "loss": 0.0166, "num_tokens": 43348424.0, "reward": 0.58984375, "reward_std": 0.27434366941452026, "rewards/accuracy_reward/mean": 0.58984375, "rewards/accuracy_reward/std": 0.49282538890838623, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 723.2421875, "completions/mean_terminated_length": 686.0, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8791677780741531, "grad_norm": 0.16555289267225737, "learning_rate": 4.6320652716067555e-08, "loss": 0.027, "num_tokens": 43566182.0, "reward": 0.58203125, "reward_std": 0.25540196895599365, "rewards/accuracy_reward/mean": 0.58203125, "rewards/accuracy_reward/std": 0.49419113993644714, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 716.3203125, "completions/mean_terminated_length": 689.7928466796875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.8834355828220859, "grad_norm": 0.15251148614367968, "learning_rate": 4.322727117869951e-08, "loss": 0.0125, "num_tokens": 43784416.0, "reward": 0.66796875, "reward_std": 0.21633265912532806, "rewards/accuracy_reward/mean": 0.66796875, "rewards/accuracy_reward/std": 0.4718646705150604, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 688.9609375, "completions/mean_terminated_length": 667.388916015625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.8877033875700187, "grad_norm": 0.17443255375688374, "learning_rate": 4.023611372427471e-08, "loss": 0.0037, "num_tokens": 44001166.0, "reward": 0.625, "reward_std": 0.2396092414855957, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4850712716579437, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 704.53515625, "completions/mean_terminated_length": 688.6047973632812, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8919711923179514, "grad_norm": 0.16853797409498864, "learning_rate": 3.734784976300165e-08, "loss": 0.0163, "num_tokens": 44226015.0, "reward": 0.5390625, "reward_std": 0.25091981887817383, "rewards/accuracy_reward/mean": 0.5390625, "rewards/accuracy_reward/std": 0.4994482398033142, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 738.59375, "completions/mean_terminated_length": 717.8095703125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.8962389970658843, "grad_norm": 0.14429226057527744, "learning_rate": 3.456312567789793e-08, "loss": 0.0026, "num_tokens": 44447887.0, "reward": 0.671875, "reward_std": 0.24396878480911255, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.47045037150382996, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 647.8515625, "completions/mean_terminated_length": 614.248046875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.900506801813817, "grad_norm": 0.21336156797511663, "learning_rate": 3.188256468013139e-08, "loss": 0.0319, "num_tokens": 44648865.0, "reward": 0.6640625, "reward_std": 0.23357079923152924, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 732.5859375, "completions/mean_terminated_length": 690.1531982421875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9047746065617498, "grad_norm": 0.1594541338224898, "learning_rate": 2.9306766669548457e-08, "loss": 0.0178, "num_tokens": 44879447.0, "reward": 0.51953125, "reward_std": 0.21660104393959045, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 666.71484375, "completions/mean_terminated_length": 644.7897338867188, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.9090424113096826, "grad_norm": 0.13402021740421238, "learning_rate": 2.6836308100417872e-08, "loss": 0.0209, "num_tokens": 45084174.0, "reward": 0.60546875, "reward_std": 0.1621571183204651, "rewards/accuracy_reward/mean": 0.60546875, "rewards/accuracy_reward/std": 0.48970720171928406, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 640.21484375, "completions/mean_terminated_length": 600.6385498046875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.9133102160576153, "grad_norm": 0.16506792868914458, "learning_rate": 2.4471741852423233e-08, "loss": 0.0484, "num_tokens": 45280933.0, "reward": 0.76953125, "reward_std": 0.16439500451087952, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 743.671875, "completions/mean_terminated_length": 717.6892700195312, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.9175780208055482, "grad_norm": 0.152763201379616, "learning_rate": 2.2213597106929605e-08, "loss": 0.0177, "num_tokens": 45510841.0, "reward": 0.6015625, "reward_std": 0.19503435492515564, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 733.44140625, "completions/mean_terminated_length": 707.2550048828125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9218458255534809, "grad_norm": 0.1645677770956383, "learning_rate": 2.0062379228555525e-08, "loss": -0.002, "num_tokens": 45732058.0, "reward": 0.67578125, "reward_std": 0.22973774373531342, "rewards/accuracy_reward/mean": 0.67578125, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1940.0, "completions/mean_length": 687.875, "completions/mean_terminated_length": 677.1653442382812, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.9261136303014137, "grad_norm": 0.24378981471372618, "learning_rate": 1.8018569652073378e-08, "loss": 0.0226, "num_tokens": 45946338.0, "reward": 0.65234375, "reward_std": 0.24302664399147034, "rewards/accuracy_reward/mean": 0.65234375, "rewards/accuracy_reward/std": 0.4771590530872345, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 725.7890625, "completions/mean_terminated_length": 683.1370849609375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.9303814350493465, "grad_norm": 0.16104351367263942, "learning_rate": 1.6082625774666792e-08, "loss": 0.0192, "num_tokens": 46169276.0, "reward": 0.58203125, "reward_std": 0.17662307620048523, "rewards/accuracy_reward/mean": 0.58203125, "rewards/accuracy_reward/std": 0.49419113993644714, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 750.88671875, "completions/mean_terminated_length": 681.4938354492188, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.9346492397972793, "grad_norm": 0.15362390550001964, "learning_rate": 1.4254980853566246e-08, "loss": 0.0354, "num_tokens": 46399943.0, "reward": 0.58984375, "reward_std": 0.19279402494430542, "rewards/accuracy_reward/mean": 0.58984375, "rewards/accuracy_reward/std": 0.49282538890838623, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1451.0, "completions/mean_length": 669.7421875, "completions/mean_terminated_length": 636.6640014648438, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.9389170445452121, "grad_norm": 0.13543826848213528, "learning_rate": 1.253604390908819e-08, "loss": 0.0214, "num_tokens": 46605885.0, "reward": 0.58984375, "reward_std": 0.13546313345432281, "rewards/accuracy_reward/mean": 0.58984375, "rewards/accuracy_reward/std": 0.49282538890838623, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 783.8828125, "completions/mean_terminated_length": 778.925537109375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9431848492931448, "grad_norm": 0.17759133312220565, "learning_rate": 1.0926199633097154e-08, "loss": 0.0247, "num_tokens": 46854583.0, "reward": 0.51953125, "reward_std": 0.1726752519607544, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 649.19140625, "completions/mean_terminated_length": 632.6047973632812, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.9474526540410776, "grad_norm": 0.1586721854654231, "learning_rate": 9.425808302913728e-09, "loss": 0.0301, "num_tokens": 47057680.0, "reward": 0.69921875, "reward_std": 0.21371403336524963, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 626.34765625, "completions/mean_terminated_length": 603.7817993164062, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.9517204587890103, "grad_norm": 0.17483755006208948, "learning_rate": 8.035205700685165e-09, "loss": 0.0203, "num_tokens": 47261201.0, "reward": 0.69140625, "reward_std": 0.19556477665901184, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1913.0, "completions/mean_length": 588.92578125, "completions/mean_terminated_length": 565.7659301757812, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.9559882635369432, "grad_norm": 0.16063372689194239, "learning_rate": 6.754703038239329e-09, "loss": 0.0156, "num_tokens": 47445534.0, "reward": 0.79296875, "reward_std": 0.17715102434158325, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 691.62109375, "completions/mean_terminated_length": 675.53759765625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.960256068284876, "grad_norm": 0.22328275656543126, "learning_rate": 5.5845868874357385e-09, "loss": 0.0216, "num_tokens": 47660813.0, "reward": 0.63671875, "reward_std": 0.2789405584335327, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 687.15234375, "completions/mean_terminated_length": 648.8955688476562, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9645238730328087, "grad_norm": 0.1940699434225098, "learning_rate": 4.5251191160326495e-09, "loss": 0.0162, "num_tokens": 47875420.0, "reward": 0.6328125, "reward_std": 0.229887455701828, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 632.83203125, "completions/mean_terminated_length": 604.6414794921875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.9687916777807415, "grad_norm": 0.14202828475012133, "learning_rate": 3.5765368290813223e-09, "loss": 0.0051, "num_tokens": 48070545.0, "reward": 0.73046875, "reward_std": 0.15952971577644348, "rewards/accuracy_reward/mean": 0.73046875, "rewards/accuracy_reward/std": 0.44458550214767456, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 783.6953125, "completions/mean_terminated_length": 742.9112548828125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.9730594825286744, "grad_norm": 1.825730745371174, "learning_rate": 2.739052315863355e-09, "loss": 0.0126, "num_tokens": 48308835.0, "reward": 0.53515625, "reward_std": 0.2778797447681427, "rewards/accuracy_reward/mean": 0.53515625, "rewards/accuracy_reward/std": 0.49973952770233154, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 572.68359375, "completions/mean_terminated_length": 561.0669555664062, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9773272872766071, "grad_norm": 0.17942667026211423, "learning_rate": 2.0128530023804656e-09, "loss": 0.0248, "num_tokens": 48489730.0, "reward": 0.73828125, "reward_std": 0.18911069631576538, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.4404313564300537, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1728.0, "completions/mean_length": 736.47265625, "completions/mean_terminated_length": 688.6842041015625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.9815950920245399, "grad_norm": 0.11589896012509639, "learning_rate": 1.3981014094099353e-09, "loss": 0.004, "num_tokens": 48727051.0, "reward": 0.55859375, "reward_std": 0.14256632328033447, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 731.9765625, "completions/mean_terminated_length": 672.8897705078125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.9858628967724726, "grad_norm": 0.17379082291728887, "learning_rate": 8.949351161324225e-10, "loss": 0.0287, "num_tokens": 48951677.0, "reward": 0.6328125, "reward_std": 0.21686306595802307, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 613.2109375, "completions/mean_terminated_length": 590.4365234375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.9901307015204054, "grad_norm": 0.14136371185185287, "learning_rate": 5.034667293427053e-10, "loss": 0.0157, "num_tokens": 49140963.0, "reward": 0.62109375, "reward_std": 0.15057817101478577, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4860650300979614, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 718.78515625, "completions/mean_terminated_length": 681.4176635742188, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.9943985062683383, "grad_norm": 0.15367312232291866, "learning_rate": 2.2378385824833866e-10, "loss": 0.0459, "num_tokens": 49355428.0, "reward": 0.671875, "reward_std": 0.15558436512947083, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.47045037150382996, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 515.4459838867188, "completions/mean_terminated_length": 515.4459838867188, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.998666311016271, "grad_norm": 0.16966303678970307, "learning_rate": 5.594909486328348e-11, "loss": 0.0063, "num_tokens": 49525500.0, "reward": 0.7890625, "reward_std": 0.2095002979040146, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 234 }, { "epoch": 0.998666311016271, "step": 234, "total_flos": 0.0, "train_loss": 0.015861935917542785, "train_runtime": 47487.0739, "train_samples_per_second": 0.158, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 234, "num_input_tokens_seen": 49525500, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }