Files
Qwen2.5-3B-grpo/trainer_state.json
ModelHub XC 8116ceb972 初始化项目,由ModelHub XC社区提供模型
Model: lhkhiem28/Qwen2.5-3B-grpo
Source: Original Platform
2026-05-06 09:22:08 +08:00

6128 lines
222 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998666311016271,
"eval_steps": 1,
"global_step": 234,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1695.0,
"completions/mean_length": 494.890625,
"completions/mean_terminated_length": 482.6614074707031,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.004267804747932782,
"grad_norm": 0.2570115373475802,
"learning_rate": 0.0,
"loss": -0.0035,
"num_tokens": 162332.0,
"reward": 0.095703125,
"reward_std": 0.17584297060966492,
"rewards/accuracy_reward/mean": 0.09375,
"rewards/accuracy_reward/std": 0.2920515835285187,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1640.0,
"completions/mean_length": 534.98828125,
"completions/mean_terminated_length": 523.0748291015625,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.008535609495865563,
"grad_norm": 0.24752090505199714,
"learning_rate": 4.166666666666666e-08,
"loss": -0.006,
"num_tokens": 333433.0,
"reward": 0.12109375,
"reward_std": 0.20258089900016785,
"rewards/accuracy_reward/mean": 0.1171875,
"rewards/accuracy_reward/std": 0.3222736418247223,
"rewards/format_reward/mean": 0.0078125,
"rewards/format_reward/std": 0.08821486681699753,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1711.0,
"completions/mean_length": 474.12109375,
"completions/mean_terminated_length": 467.94903564453125,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.012803414243798347,
"grad_norm": 0.353285694760891,
"learning_rate": 8.333333333333333e-08,
"loss": -0.0132,
"num_tokens": 489184.0,
"reward": 0.1796875,
"reward_std": 0.308138370513916,
"rewards/accuracy_reward/mean": 0.1796875,
"rewards/accuracy_reward/std": 0.38467901945114136,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1710.0,
"completions/mean_length": 419.41015625,
"completions/mean_terminated_length": 413.0235595703125,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.017071218991731127,
"grad_norm": 0.3238140041719121,
"learning_rate": 1.25e-07,
"loss": -0.0107,
"num_tokens": 629873.0,
"reward": 0.130859375,
"reward_std": 0.21213515102863312,
"rewards/accuracy_reward/mean": 0.12890625,
"rewards/accuracy_reward/std": 0.33575257658958435,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1873.0,
"completions/mean_length": 506.5234375,
"completions/mean_terminated_length": 488.2450866699219,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.021339023739663912,
"grad_norm": 0.3191673881250607,
"learning_rate": 1.6666666666666665e-07,
"loss": -0.0379,
"num_tokens": 795031.0,
"reward": 0.150390625,
"reward_std": 0.2623838186264038,
"rewards/accuracy_reward/mean": 0.14453125,
"rewards/accuracy_reward/std": 0.35231640934944153,
"rewards/format_reward/mean": 0.01171875,
"rewards/format_reward/std": 0.1078278198838234,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1800.0,
"completions/mean_length": 536.17578125,
"completions/mean_terminated_length": 512.1785888671875,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.025606828487596694,
"grad_norm": 0.23250426520311288,
"learning_rate": 2.0833333333333333e-07,
"loss": -0.0285,
"num_tokens": 967388.0,
"reward": 0.11328125,
"reward_std": 0.19621387124061584,
"rewards/accuracy_reward/mean": 0.11328125,
"rewards/accuracy_reward/std": 0.31755712628364563,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 581.45703125,
"completions/mean_terminated_length": 569.909423828125,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.029874633235529476,
"grad_norm": 0.2570695233430195,
"learning_rate": 2.5e-07,
"loss": -0.0245,
"num_tokens": 1155489.0,
"reward": 0.087890625,
"reward_std": 0.1791757494211197,
"rewards/accuracy_reward/mean": 0.0859375,
"rewards/accuracy_reward/std": 0.28082075715065,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1657.0,
"completions/max_terminated_length": 1657.0,
"completions/mean_length": 448.046875,
"completions/mean_terminated_length": 448.046875,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.034142437983462254,
"grad_norm": 0.3667967373682546,
"learning_rate": 2.916666666666667e-07,
"loss": -0.0086,
"num_tokens": 1306133.0,
"reward": 0.125,
"reward_std": 0.2576282024383545,
"rewards/accuracy_reward/mean": 0.12109375,
"rewards/accuracy_reward/std": 0.3268752694129944,
"rewards/format_reward/mean": 0.0078125,
"rewards/format_reward/std": 0.08821486681699753,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1475.0,
"completions/mean_length": 448.9921875,
"completions/mean_terminated_length": 436.4015808105469,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.03841024273139504,
"grad_norm": 0.41766001020055105,
"learning_rate": 3.333333333333333e-07,
"loss": -0.026,
"num_tokens": 1456275.0,
"reward": 0.171875,
"reward_std": 0.27157536149024963,
"rewards/accuracy_reward/mean": 0.171875,
"rewards/accuracy_reward/std": 0.3780108094215393,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1978.0,
"completions/mean_length": 480.38671875,
"completions/mean_terminated_length": 474.2392578125,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.042678047479327824,
"grad_norm": 0.32972526301694516,
"learning_rate": 3.75e-07,
"loss": -0.064,
"num_tokens": 1610758.0,
"reward": 0.14453125,
"reward_std": 0.23395395278930664,
"rewards/accuracy_reward/mean": 0.14453125,
"rewards/accuracy_reward/std": 0.35231640934944153,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1698.0,
"completions/mean_length": 504.75,
"completions/mean_terminated_length": 492.5984191894531,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.0469458522272606,
"grad_norm": 0.2723514184754716,
"learning_rate": 4.1666666666666667e-07,
"loss": -0.0389,
"num_tokens": 1775486.0,
"reward": 0.123046875,
"reward_std": 0.2098972499370575,
"rewards/accuracy_reward/mean": 0.12109375,
"rewards/accuracy_reward/std": 0.3268752694129944,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1807.0,
"completions/mean_length": 490.03515625,
"completions/mean_terminated_length": 477.7677001953125,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.05121365697519339,
"grad_norm": 0.9124980782540163,
"learning_rate": 4.5833333333333327e-07,
"loss": -0.0169,
"num_tokens": 1936959.0,
"reward": 0.12890625,
"reward_std": 0.21988865733146667,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.33136674761772156,
"rewards/format_reward/mean": 0.0078125,
"rewards/format_reward/std": 0.08821486681699753,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1770.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 571.9609375,
"completions/mean_terminated_length": 571.9609375,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.055481461723126166,
"grad_norm": 0.27097928837090157,
"learning_rate": 5e-07,
"loss": -0.0412,
"num_tokens": 2122461.0,
"reward": 0.15625,
"reward_std": 0.25407516956329346,
"rewards/accuracy_reward/mean": 0.15625,
"rewards/accuracy_reward/std": 0.3638034462928772,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1980.0,
"completions/mean_length": 543.5546875,
"completions/mean_terminated_length": 531.7086791992188,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.05974926647105895,
"grad_norm": 0.2412268039467004,
"learning_rate": 5.416666666666666e-07,
"loss": 0.0021,
"num_tokens": 2293243.0,
"reward": 0.1328125,
"reward_std": 0.20843946933746338,
"rewards/accuracy_reward/mean": 0.1328125,
"rewards/accuracy_reward/std": 0.3400367796421051,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 499.04296875,
"completions/mean_terminated_length": 474.4563903808594,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.06401707121899174,
"grad_norm": 0.27402845211677135,
"learning_rate": 5.833333333333334e-07,
"loss": -0.0334,
"num_tokens": 2454966.0,
"reward": 0.173828125,
"reward_std": 0.24171365797519684,
"rewards/accuracy_reward/mean": 0.171875,
"rewards/accuracy_reward/std": 0.3780108094215393,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1818.0,
"completions/mean_length": 517.296875,
"completions/mean_terminated_length": 486.8048095703125,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.06828487596692451,
"grad_norm": 0.27961598068647736,
"learning_rate": 6.249999999999999e-07,
"loss": -0.0055,
"num_tokens": 2620058.0,
"reward": 0.16015625,
"reward_std": 0.2830517292022705,
"rewards/accuracy_reward/mean": 0.15234375,
"rewards/accuracy_reward/std": 0.3600577116012573,
"rewards/format_reward/mean": 0.015625,
"rewards/format_reward/std": 0.12426253408193588,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1804.0,
"completions/mean_length": 512.2109375,
"completions/mean_terminated_length": 506.1882629394531,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.07255268071485729,
"grad_norm": 0.2719551051491964,
"learning_rate": 6.666666666666666e-07,
"loss": -0.0057,
"num_tokens": 2796384.0,
"reward": 0.12890625,
"reward_std": 0.2379668951034546,
"rewards/accuracy_reward/mean": 0.125,
"rewards/accuracy_reward/std": 0.33136674761772156,
"rewards/format_reward/mean": 0.0078125,
"rewards/format_reward/std": 0.08821486681699753,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 531.515625,
"completions/mean_terminated_length": 507.4444885253906,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.07682048546279008,
"grad_norm": 0.29037656472769297,
"learning_rate": 7.083333333333334e-07,
"loss": -0.0302,
"num_tokens": 2978284.0,
"reward": 0.138671875,
"reward_std": 0.25321292877197266,
"rewards/accuracy_reward/mean": 0.13671875,
"rewards/accuracy_reward/std": 0.34422317147254944,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1774.0,
"completions/mean_length": 506.19921875,
"completions/mean_terminated_length": 494.0590515136719,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.08108829021072286,
"grad_norm": 0.2963880732497132,
"learning_rate": 7.5e-07,
"loss": -0.0176,
"num_tokens": 3147335.0,
"reward": 0.171875,
"reward_std": 0.2536035180091858,
"rewards/accuracy_reward/mean": 0.16796875,
"rewards/accuracy_reward/std": 0.3745708465576172,
"rewards/format_reward/mean": 0.0078125,
"rewards/format_reward/std": 0.08821486681699753,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1865.0,
"completions/mean_length": 622.21875,
"completions/mean_terminated_length": 582.1365356445312,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"epoch": 0.08535609495865565,
"grad_norm": 0.23734206503775232,
"learning_rate": 7.916666666666666e-07,
"loss": -0.0059,
"num_tokens": 3345599.0,
"reward": 0.15625,
"reward_std": 0.23474395275115967,
"rewards/accuracy_reward/mean": 0.15625,
"rewards/accuracy_reward/std": 0.3638034462928772,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.0,
"completions/mean_length": 569.54296875,
"completions/mean_terminated_length": 557.9015502929688,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.08962389970658842,
"grad_norm": 0.2636621963543347,
"learning_rate": 8.333333333333333e-07,
"loss": -0.0261,
"num_tokens": 3539034.0,
"reward": 0.16796875,
"reward_std": 0.2069915533065796,
"rewards/accuracy_reward/mean": 0.16796875,
"rewards/accuracy_reward/std": 0.3745708465576172,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 557.5078125,
"completions/mean_terminated_length": 545.7716674804688,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"epoch": 0.0938917044545212,
"grad_norm": 0.3118436045433341,
"learning_rate": 8.75e-07,
"loss": -0.0542,
"num_tokens": 3716140.0,
"reward": 0.197265625,
"reward_std": 0.27905112504959106,
"rewards/accuracy_reward/mean": 0.1953125,
"rewards/accuracy_reward/std": 0.39721766114234924,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1582.0,
"completions/mean_length": 566.75390625,
"completions/mean_terminated_length": 537.2470092773438,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.09815950920245399,
"grad_norm": 0.30754531099916416,
"learning_rate": 9.166666666666665e-07,
"loss": -0.0516,
"num_tokens": 3901429.0,
"reward": 0.154296875,
"reward_std": 0.24952469766139984,
"rewards/accuracy_reward/mean": 0.15234375,
"rewards/accuracy_reward/std": 0.3600577116012573,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 519.75390625,
"completions/mean_terminated_length": 513.7608032226562,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.10242731395038678,
"grad_norm": 0.32388668226405115,
"learning_rate": 9.583333333333334e-07,
"loss": -0.0207,
"num_tokens": 4071686.0,
"reward": 0.1796875,
"reward_std": 0.30168429017066956,
"rewards/accuracy_reward/mean": 0.1796875,
"rewards/accuracy_reward/std": 0.38467901945114136,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1993.0,
"completions/mean_length": 632.21875,
"completions/mean_terminated_length": 586.54833984375,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"epoch": 0.10669511869831955,
"grad_norm": 0.28009202798891364,
"learning_rate": 1e-06,
"loss": 0.0113,
"num_tokens": 4272478.0,
"reward": 0.173828125,
"reward_std": 0.2615548372268677,
"rewards/accuracy_reward/mean": 0.171875,
"rewards/accuracy_reward/std": 0.3780108094215393,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1792.0,
"completions/mean_length": 548.80859375,
"completions/mean_terminated_length": 531.0316162109375,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.11096292344625233,
"grad_norm": 0.3052060877640385,
"learning_rate": 9.999440509051367e-07,
"loss": 0.0023,
"num_tokens": 4445901.0,
"reward": 0.3359375,
"reward_std": 0.34599077701568604,
"rewards/accuracy_reward/mean": 0.3359375,
"rewards/accuracy_reward/std": 0.4732423722743988,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 547.10546875,
"completions/mean_terminated_length": 535.2874145507812,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.11523072819418512,
"grad_norm": 0.35684106787185593,
"learning_rate": 9.997762161417517e-07,
"loss": 0.0009,
"num_tokens": 4622760.0,
"reward": 0.4296875,
"reward_std": 0.4141198396682739,
"rewards/accuracy_reward/mean": 0.42578125,
"rewards/accuracy_reward/std": 0.49542948603630066,
"rewards/format_reward/mean": 0.0078125,
"rewards/format_reward/std": 0.08821486681699753,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1728.0,
"completions/mean_length": 582.15234375,
"completions/mean_terminated_length": 570.6102294921875,
"completions/min_length": 10.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1194985329421179,
"grad_norm": 0.314364820870329,
"learning_rate": 9.994965332706572e-07,
"loss": -0.0559,
"num_tokens": 4807967.0,
"reward": 0.396484375,
"reward_std": 0.369541734457016,
"rewards/accuracy_reward/mean": 0.39453125,
"rewards/accuracy_reward/std": 0.48970720171928406,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 614.94921875,
"completions/mean_terminated_length": 592.202392578125,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.12376633769005067,
"grad_norm": 47.01468065661083,
"learning_rate": 9.991050648838675e-07,
"loss": -0.049,
"num_tokens": 5002658.0,
"reward": 0.341796875,
"reward_std": 0.33887720108032227,
"rewards/accuracy_reward/mean": 0.33203125,
"rewards/accuracy_reward/std": 0.4718646705150604,
"rewards/format_reward/mean": 0.01953125,
"rewards/format_reward/std": 0.13865381479263306,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1812.0,
"completions/mean_length": 749.625,
"completions/mean_terminated_length": 729.0159301757812,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.12803414243798347,
"grad_norm": 0.22791664567073833,
"learning_rate": 9.986018985905899e-07,
"loss": -0.0017,
"num_tokens": 5230594.0,
"reward": 0.24609375,
"reward_std": 0.2706603407859802,
"rewards/accuracy_reward/mean": 0.24609375,
"rewards/accuracy_reward/std": 0.43157756328582764,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 657.0546875,
"completions/mean_terminated_length": 640.561279296875,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"epoch": 0.13230194718591626,
"grad_norm": 0.2592014737956619,
"learning_rate": 9.979871469976195e-07,
"loss": -0.0089,
"num_tokens": 5429960.0,
"reward": 0.390625,
"reward_std": 0.35836219787597656,
"rewards/accuracy_reward/mean": 0.390625,
"rewards/accuracy_reward/std": 0.48884621262550354,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1834.0,
"completions/mean_length": 641.6796875,
"completions/mean_terminated_length": 625.0039672851562,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.13656975193384902,
"grad_norm": 0.29062097226457245,
"learning_rate": 9.972609476841365e-07,
"loss": -0.0154,
"num_tokens": 5633382.0,
"reward": 0.43359375,
"reward_std": 0.35441678762435913,
"rewards/accuracy_reward/mean": 0.43359375,
"rewards/accuracy_reward/std": 0.4965413510799408,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1923.0,
"completions/mean_length": 752.05078125,
"completions/mean_terminated_length": 720.9480590820312,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.1408375566817818,
"grad_norm": 0.2223733295468163,
"learning_rate": 9.964234631709185e-07,
"loss": -0.0186,
"num_tokens": 5865835.0,
"reward": 0.33203125,
"reward_std": 0.3237749934196472,
"rewards/accuracy_reward/mean": 0.33203125,
"rewards/accuracy_reward/std": 0.4718646705150604,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1993.0,
"completions/mean_length": 642.63671875,
"completions/mean_terminated_length": 631.5708618164062,
"completions/min_length": 117.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.14510536142971459,
"grad_norm": 0.28031006546472104,
"learning_rate": 9.954748808839674e-07,
"loss": 0.0198,
"num_tokens": 6069782.0,
"reward": 0.44140625,
"reward_std": 0.34704914689064026,
"rewards/accuracy_reward/mean": 0.44140625,
"rewards/accuracy_reward/std": 0.4975275993347168,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1777.0,
"completions/mean_length": 678.16796875,
"completions/mean_terminated_length": 656.4246215820312,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.14937316617764737,
"grad_norm": 0.2753300220077088,
"learning_rate": 9.944154131125642e-07,
"loss": 0.0057,
"num_tokens": 6277537.0,
"reward": 0.435546875,
"reward_std": 0.31180548667907715,
"rewards/accuracy_reward/mean": 0.43359375,
"rewards/accuracy_reward/std": 0.4965413510799408,
"rewards/format_reward/mean": 0.00390625,
"rewards/format_reward/std": 0.0625,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1879.0,
"completions/mean_length": 615.171875,
"completions/mean_terminated_length": 592.4285888671875,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.15364097092558016,
"grad_norm": 0.2519379051726046,
"learning_rate": 9.932452969617607e-07,
"loss": 0.0259,
"num_tokens": 6474301.0,
"reward": 0.51953125,
"reward_std": 0.3044525980949402,
"rewards/accuracy_reward/mean": 0.51953125,
"rewards/accuracy_reward/std": 0.5005971193313599,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1755.0,
"completions/mean_length": 697.0859375,
"completions/mean_terminated_length": 686.4487915039062,
"completions/min_length": 130.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.15790877567351294,
"grad_norm": 0.23690228215779296,
"learning_rate": 9.919647942993147e-07,
"loss": 0.0077,
"num_tokens": 6685995.0,
"reward": 0.48046875,
"reward_std": 0.2803860306739807,
"rewards/accuracy_reward/mean": 0.48046875,
"rewards/accuracy_reward/std": 0.5005971193313599,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 761.08203125,
"completions/mean_terminated_length": 724.903564453125,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.16217658042144573,
"grad_norm": 0.26189309931903004,
"learning_rate": 9.905741916970863e-07,
"loss": 0.0554,
"num_tokens": 6917440.0,
"reward": 0.5703125,
"reward_std": 0.24211551249027252,
"rewards/accuracy_reward/mean": 0.5703125,
"rewards/accuracy_reward/std": 0.4960011839866638,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 718.17578125,
"completions/mean_terminated_length": 686.260009765625,
"completions/min_length": 8.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.1664443851693785,
"grad_norm": 0.22433030609891538,
"learning_rate": 9.890738003669027e-07,
"loss": 0.0011,
"num_tokens": 7140173.0,
"reward": 0.421875,
"reward_std": 0.3233967423439026,
"rewards/accuracy_reward/mean": 0.421875,
"rewards/accuracy_reward/std": 0.49482619762420654,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1640.0,
"completions/mean_length": 707.8203125,
"completions/mean_terminated_length": 664.5886840820312,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.1707121899173113,
"grad_norm": 0.24142502490016315,
"learning_rate": 9.874639560909118e-07,
"loss": 0.0948,
"num_tokens": 7360927.0,
"reward": 0.578125,
"reward_std": 0.25620076060295105,
"rewards/accuracy_reward/mean": 0.578125,
"rewards/accuracy_reward/std": 0.49482619762420654,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 667.9375,
"completions/mean_terminated_length": 617.65185546875,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.17497999466524405,
"grad_norm": 0.23034233566289028,
"learning_rate": 9.857450191464337e-07,
"loss": 0.0705,
"num_tokens": 7565055.0,
"reward": 0.56640625,
"reward_std": 0.29234567284584045,
"rewards/accuracy_reward/mean": 0.56640625,
"rewards/accuracy_reward/std": 0.4965413510799408,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1835.0,
"completions/mean_length": 808.91015625,
"completions/mean_terminated_length": 763.7611694335938,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.17924779941317684,
"grad_norm": 0.15288104504565547,
"learning_rate": 9.839173742253334e-07,
"loss": 0.0067,
"num_tokens": 7812320.0,
"reward": 0.453125,
"reward_std": 0.2123872935771942,
"rewards/accuracy_reward/mean": 0.453125,
"rewards/accuracy_reward/std": 0.4987730085849762,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1916.0,
"completions/mean_length": 757.765625,
"completions/mean_terminated_length": 710.7530517578125,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.18351560416110962,
"grad_norm": 0.21817363062261033,
"learning_rate": 9.819814303479267e-07,
"loss": 0.0301,
"num_tokens": 8044124.0,
"reward": 0.51953125,
"reward_std": 0.2767027020454407,
"rewards/accuracy_reward/mean": 0.51953125,
"rewards/accuracy_reward/std": 0.5005971193313599,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1877.0,
"completions/mean_length": 696.4609375,
"completions/mean_terminated_length": 618.272705078125,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.1877834089090424,
"grad_norm": 0.23513888129908564,
"learning_rate": 9.799376207714444e-07,
"loss": 0.1136,
"num_tokens": 8258978.0,
"reward": 0.64453125,
"reward_std": 0.2685386538505554,
"rewards/accuracy_reward/mean": 0.64453125,
"rewards/accuracy_reward/std": 0.4795927405357361,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1979.0,
"completions/mean_length": 823.21484375,
"completions/mean_terminated_length": 783.7056274414062,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.1920512136569752,
"grad_norm": 0.15675122066313374,
"learning_rate": 9.777864028930705e-07,
"loss": 0.0199,
"num_tokens": 8509073.0,
"reward": 0.44921875,
"reward_std": 0.247502401471138,
"rewards/accuracy_reward/mean": 0.44921875,
"rewards/accuracy_reward/std": 0.49838894605636597,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1888.0,
"completions/mean_length": 788.9765625,
"completions/mean_terminated_length": 748.3628540039062,
"completions/min_length": 245.0,
"completions/min_terminated_length": 245.0,
"epoch": 0.19631901840490798,
"grad_norm": 0.20454769015675417,
"learning_rate": 9.755282581475767e-07,
"loss": 0.038,
"num_tokens": 8753347.0,
"reward": 0.59765625,
"reward_std": 0.32180553674697876,
"rewards/accuracy_reward/mean": 0.59765625,
"rewards/accuracy_reward/std": 0.4913311004638672,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1943.0,
"completions/mean_length": 688.484375,
"completions/mean_terminated_length": 661.4024047851562,
"completions/min_length": 4.0,
"completions/min_terminated_length": 4.0,
"epoch": 0.20058682315284077,
"grad_norm": 0.2436406130056707,
"learning_rate": 9.73163691899582e-07,
"loss": 0.0891,
"num_tokens": 8961887.0,
"reward": 0.59765625,
"reward_std": 0.3268141746520996,
"rewards/accuracy_reward/mean": 0.59765625,
"rewards/accuracy_reward/std": 0.4913311004638672,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1836.0,
"completions/max_terminated_length": 1836.0,
"completions/mean_length": 636.9921875,
"completions/mean_terminated_length": 636.9921875,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.20485462790077355,
"grad_norm": 0.19700830452823495,
"learning_rate": 9.706932333304517e-07,
"loss": 0.0074,
"num_tokens": 9161397.0,
"reward": 0.5390625,
"reward_std": 0.20897233486175537,
"rewards/accuracy_reward/mean": 0.5390625,
"rewards/accuracy_reward/std": 0.4994482398033142,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1866.0,
"completions/mean_length": 730.828125,
"completions/mean_terminated_length": 720.4566650390625,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.2091224326487063,
"grad_norm": 0.1802054848104146,
"learning_rate": 9.681174353198686e-07,
"loss": 0.0045,
"num_tokens": 9386945.0,
"reward": 0.56640625,
"reward_std": 0.25209930539131165,
"rewards/accuracy_reward/mean": 0.56640625,
"rewards/accuracy_reward/std": 0.4965413510799408,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 756.77734375,
"completions/mean_terminated_length": 704.28857421875,
"completions/min_length": 208.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.2133902373966391,
"grad_norm": 0.20316168234356663,
"learning_rate": 9.65436874322102e-07,
"loss": 0.0501,
"num_tokens": 9618008.0,
"reward": 0.53125,
"reward_std": 0.2852449417114258,
"rewards/accuracy_reward/mean": 0.53125,
"rewards/accuracy_reward/std": 0.5,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1880.0,
"completions/mean_length": 612.92578125,
"completions/mean_terminated_length": 595.9091186523438,
"completions/min_length": 9.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.21765804214457188,
"grad_norm": 0.48416880632938786,
"learning_rate": 9.626521502369983e-07,
"loss": 0.0519,
"num_tokens": 9805733.0,
"reward": 0.6953125,
"reward_std": 0.28801077604293823,
"rewards/accuracy_reward/mean": 0.6953125,
"rewards/accuracy_reward/std": 0.4611765742301941,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1925.0,
"completions/mean_length": 789.7734375,
"completions/mean_terminated_length": 749.1854858398438,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.22192584689250466,
"grad_norm": 0.19757693281422897,
"learning_rate": 9.597638862757253e-07,
"loss": 0.0277,
"num_tokens": 10046019.0,
"reward": 0.48828125,
"reward_std": 0.18937908113002777,
"rewards/accuracy_reward/mean": 0.48828125,
"rewards/accuracy_reward/std": 0.5008418560028076,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 758.91015625,
"completions/mean_terminated_length": 717.3265991210938,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.22619365164043745,
"grad_norm": 0.2070702495543546,
"learning_rate": 9.567727288213004e-07,
"loss": 0.0079,
"num_tokens": 10275228.0,
"reward": 0.578125,
"reward_std": 0.29420530796051025,
"rewards/accuracy_reward/mean": 0.578125,
"rewards/accuracy_reward/std": 0.49482619762420654,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 682.43359375,
"completions/mean_terminated_length": 638.383056640625,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.23046145638837023,
"grad_norm": 0.20865981520254198,
"learning_rate": 9.536793472839324e-07,
"loss": 0.0807,
"num_tokens": 10485083.0,
"reward": 0.66015625,
"reward_std": 0.2412043958902359,
"rewards/accuracy_reward/mean": 0.66015625,
"rewards/accuracy_reward/std": 0.47458380460739136,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1971.0,
"completions/mean_length": 789.47265625,
"completions/mean_terminated_length": 764.4024047851562,
"completions/min_length": 150.0,
"completions/min_terminated_length": 150.0,
"epoch": 0.23472926113630302,
"grad_norm": 0.24817725107971106,
"learning_rate": 9.504844339512094e-07,
"loss": 0.0141,
"num_tokens": 10725292.0,
"reward": 0.4765625,
"reward_std": 0.22461043298244476,
"rewards/accuracy_reward/mean": 0.4765625,
"rewards/accuracy_reward/std": 0.5004287362098694,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1944.0,
"completions/mean_length": 790.01171875,
"completions/mean_terminated_length": 738.8739624023438,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.2389970658842358,
"grad_norm": 0.1380585420387336,
"learning_rate": 9.471887038331684e-07,
"loss": 0.0103,
"num_tokens": 10969431.0,
"reward": 0.5078125,
"reward_std": 0.21884137392044067,
"rewards/accuracy_reward/mean": 0.5078125,
"rewards/accuracy_reward/std": 0.5009182691574097,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 777.04296875,
"completions/mean_terminated_length": 746.5400390625,
"completions/min_length": 5.0,
"completions/min_terminated_length": 5.0,
"epoch": 0.2432648706321686,
"grad_norm": 0.17843550897472416,
"learning_rate": 9.43792894502277e-07,
"loss": 0.0051,
"num_tokens": 11202554.0,
"reward": 0.5,
"reward_std": 0.28064805269241333,
"rewards/accuracy_reward/mean": 0.5,
"rewards/accuracy_reward/std": 0.5009794235229492,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1974.0,
"completions/mean_length": 738.20703125,
"completions/mean_terminated_length": 701.385498046875,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.24753267538010135,
"grad_norm": 0.19995757252218357,
"learning_rate": 9.402977659283689e-07,
"loss": 0.0315,
"num_tokens": 11427103.0,
"reward": 0.5078125,
"reward_std": 0.24606332182884216,
"rewards/accuracy_reward/mean": 0.5078125,
"rewards/accuracy_reward/std": 0.5009182691574097,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1812.0,
"completions/mean_length": 704.59375,
"completions/mean_terminated_length": 672.35205078125,
"completions/min_length": 226.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.25180048012803413,
"grad_norm": 0.20212521010131737,
"learning_rate": 9.367041003085648e-07,
"loss": 0.005,
"num_tokens": 11646887.0,
"reward": 0.6015625,
"reward_std": 0.29538238048553467,
"rewards/accuracy_reward/mean": 0.6015625,
"rewards/accuracy_reward/std": 0.4905354380607605,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1927.0,
"completions/mean_length": 722.8828125,
"completions/mean_terminated_length": 674.5991821289062,
"completions/min_length": 227.0,
"completions/min_terminated_length": 227.0,
"epoch": 0.25606828487596694,
"grad_norm": 0.1889634470793339,
"learning_rate": 9.330127018922193e-07,
"loss": 0.0282,
"num_tokens": 11871153.0,
"reward": 0.51953125,
"reward_std": 0.26025843620300293,
"rewards/accuracy_reward/mean": 0.51953125,
"rewards/accuracy_reward/std": 0.5005971193313599,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2041.0,
"completions/mean_length": 688.640625,
"completions/mean_terminated_length": 661.561767578125,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.2603360896238997,
"grad_norm": 0.16527044482740116,
"learning_rate": 9.29224396800933e-07,
"loss": 0.0011,
"num_tokens": 12085317.0,
"reward": 0.59765625,
"reward_std": 0.24302664399147034,
"rewards/accuracy_reward/mean": 0.59765625,
"rewards/accuracy_reward/std": 0.4913311004638672,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 739.89453125,
"completions/mean_terminated_length": 703.1204833984375,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.2646038943718325,
"grad_norm": 0.1817045096138573,
"learning_rate": 9.253400328436698e-07,
"loss": 0.0054,
"num_tokens": 12313746.0,
"reward": 0.5625,
"reward_std": 0.28380340337753296,
"rewards/accuracy_reward/mean": 0.5625,
"rewards/accuracy_reward/std": 0.49705013632774353,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 701.35546875,
"completions/mean_terminated_length": 679.980224609375,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.2688716991197653,
"grad_norm": 0.1839180488541761,
"learning_rate": 9.213604793270196e-07,
"loss": 0.0088,
"num_tokens": 12524053.0,
"reward": 0.6015625,
"reward_std": 0.2809164524078369,
"rewards/accuracy_reward/mean": 0.6015625,
"rewards/accuracy_reward/std": 0.4905354380607605,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 719.71484375,
"completions/mean_terminated_length": 693.2550048828125,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"epoch": 0.27313950386769803,
"grad_norm": 0.1587390985090563,
"learning_rate": 9.172866268606513e-07,
"loss": 0.0182,
"num_tokens": 12743868.0,
"reward": 0.578125,
"reward_std": 0.17833054065704346,
"rewards/accuracy_reward/mean": 0.578125,
"rewards/accuracy_reward/std": 0.49482619762420654,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1978.0,
"completions/mean_length": 676.328125,
"completions/mean_terminated_length": 649.0040283203125,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.27740730861563084,
"grad_norm": 0.16100932533778303,
"learning_rate": 9.131193871579974e-07,
"loss": 0.0195,
"num_tokens": 12951496.0,
"reward": 0.6640625,
"reward_std": 0.2225247025489807,
"rewards/accuracy_reward/mean": 0.6640625,
"rewards/accuracy_reward/std": 0.4732423722743988,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2030.0,
"completions/mean_length": 701.2421875,
"completions/mean_terminated_length": 674.4143676757812,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.2816751133635636,
"grad_norm": 0.46716467212075274,
"learning_rate": 9.088596928322157e-07,
"loss": 0.0134,
"num_tokens": 13165550.0,
"reward": 0.62109375,
"reward_std": 0.21109546720981598,
"rewards/accuracy_reward/mean": 0.62109375,
"rewards/accuracy_reward/std": 0.4860650300979614,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 842.6328125,
"completions/mean_terminated_length": 793.6340942382812,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.2859429181114964,
"grad_norm": 0.16388899852140715,
"learning_rate": 9.045084971874737e-07,
"loss": 0.0186,
"num_tokens": 13415840.0,
"reward": 0.55859375,
"reward_std": 0.29036736488342285,
"rewards/accuracy_reward/mean": 0.55859375,
"rewards/accuracy_reward/std": 0.4975275993347168,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1669.0,
"completions/mean_length": 677.08984375,
"completions/mean_terminated_length": 660.8340454101562,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.29021072285942917,
"grad_norm": 0.2061346542221812,
"learning_rate": 9.000667740056032e-07,
"loss": 0.0402,
"num_tokens": 13622999.0,
"reward": 0.671875,
"reward_std": 0.30510413646698,
"rewards/accuracy_reward/mean": 0.671875,
"rewards/accuracy_reward/std": 0.47045037150382996,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 772.22265625,
"completions/mean_terminated_length": 751.9722900390625,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"epoch": 0.294478527607362,
"grad_norm": 0.16099414892174813,
"learning_rate": 8.955355173281707e-07,
"loss": 0.0091,
"num_tokens": 13854584.0,
"reward": 0.53125,
"reward_std": 0.21226614713668823,
"rewards/accuracy_reward/mean": 0.53125,
"rewards/accuracy_reward/std": 0.5,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1719.0,
"completions/mean_length": 664.7421875,
"completions/mean_terminated_length": 637.187255859375,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.29874633235529474,
"grad_norm": 0.17624371113200196,
"learning_rate": 8.909157412340149e-07,
"loss": 0.0376,
"num_tokens": 14064302.0,
"reward": 0.59375,
"reward_std": 0.17833054065704346,
"rewards/accuracy_reward/mean": 0.59375,
"rewards/accuracy_reward/std": 0.49209436774253845,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1644.0,
"completions/mean_length": 652.93359375,
"completions/mean_terminated_length": 647.4627685546875,
"completions/min_length": 85.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.30301413710322755,
"grad_norm": 0.16045190404205056,
"learning_rate": 8.862084796122997e-07,
"loss": 0.0079,
"num_tokens": 14270813.0,
"reward": 0.62109375,
"reward_std": 0.2211979478597641,
"rewards/accuracy_reward/mean": 0.62109375,
"rewards/accuracy_reward/std": 0.4860650300979614,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 688.3203125,
"completions/mean_terminated_length": 650.0963745117188,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.3072819418511603,
"grad_norm": 0.1887414611146376,
"learning_rate": 8.814147859311332e-07,
"loss": 0.0354,
"num_tokens": 14480903.0,
"reward": 0.65625,
"reward_std": 0.20672954618930817,
"rewards/accuracy_reward/mean": 0.65625,
"rewards/accuracy_reward/std": 0.47588926553726196,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.0,
"completions/mean_length": 682.43359375,
"completions/mean_terminated_length": 644.0441284179688,
"completions/min_length": 242.0,
"completions/min_terminated_length": 242.0,
"epoch": 0.31154974659909307,
"grad_norm": 0.1906200755581348,
"learning_rate": 8.765357330018055e-07,
"loss": 0.0125,
"num_tokens": 14692814.0,
"reward": 0.59765625,
"reward_std": 0.27275487780570984,
"rewards/accuracy_reward/mean": 0.59765625,
"rewards/accuracy_reward/std": 0.4913311004638672,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1928.0,
"completions/mean_length": 807.25390625,
"completions/mean_terminated_length": 772.3734741210938,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.3158175513470259,
"grad_norm": 0.1883899046813179,
"learning_rate": 8.71572412738697e-07,
"loss": 0.0295,
"num_tokens": 14933559.0,
"reward": 0.5703125,
"reward_std": 0.28209349513053894,
"rewards/accuracy_reward/mean": 0.5703125,
"rewards/accuracy_reward/std": 0.4960011839866638,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 674.0078125,
"completions/mean_terminated_length": 641.0320434570312,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.32008535609495864,
"grad_norm": 0.21767892682202633,
"learning_rate": 8.66525935914913e-07,
"loss": 0.0231,
"num_tokens": 15138305.0,
"reward": 0.609375,
"reward_std": 0.2489503175020218,
"rewards/accuracy_reward/mean": 0.609375,
"rewards/accuracy_reward/std": 0.48884621262550354,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2001.0,
"completions/mean_length": 650.12890625,
"completions/mean_terminated_length": 622.2828979492188,
"completions/min_length": 155.0,
"completions/min_terminated_length": 155.0,
"epoch": 0.32435316084289145,
"grad_norm": 0.19564087736557972,
"learning_rate": 8.613974319136957e-07,
"loss": 0.0185,
"num_tokens": 15338842.0,
"reward": 0.6640625,
"reward_std": 0.22423464059829712,
"rewards/accuracy_reward/mean": 0.6640625,
"rewards/accuracy_reward/std": 0.4732423722743988,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1601.0,
"completions/mean_length": 730.609375,
"completions/mean_terminated_length": 677.056884765625,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.3286209655908242,
"grad_norm": 0.17202197261802565,
"learning_rate": 8.561880484756724e-07,
"loss": 0.0369,
"num_tokens": 15566942.0,
"reward": 0.5078125,
"reward_std": 0.2588193416595459,
"rewards/accuracy_reward/mean": 0.5078125,
"rewards/accuracy_reward/std": 0.5009182691574097,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 748.05078125,
"completions/mean_terminated_length": 695.207275390625,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.332888770338757,
"grad_norm": 0.19789651992275556,
"learning_rate": 8.508989514419958e-07,
"loss": 0.0341,
"num_tokens": 15796491.0,
"reward": 0.5546875,
"reward_std": 0.29169410467147827,
"rewards/accuracy_reward/mean": 0.5546875,
"rewards/accuracy_reward/std": 0.49797385931015015,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1970.0,
"completions/mean_length": 781.20703125,
"completions/mean_terminated_length": 766.185791015625,
"completions/min_length": 280.0,
"completions/min_terminated_length": 280.0,
"epoch": 0.3371565750866898,
"grad_norm": 0.18106530098095389,
"learning_rate": 8.455313244934324e-07,
"loss": 0.002,
"num_tokens": 16034232.0,
"reward": 0.55859375,
"reward_std": 0.2968214750289917,
"rewards/accuracy_reward/mean": 0.55859375,
"rewards/accuracy_reward/std": 0.4975275993347168,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1901.0,
"completions/mean_length": 592.90625,
"completions/mean_terminated_length": 569.8095703125,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.3414243798346226,
"grad_norm": 0.19156344376777484,
"learning_rate": 8.400863688854596e-07,
"loss": 0.0214,
"num_tokens": 16218288.0,
"reward": 0.71875,
"reward_std": 0.2315973937511444,
"rewards/accuracy_reward/mean": 0.71875,
"rewards/accuracy_reward/std": 0.45048993825912476,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2038.0,
"completions/mean_length": 712.8125,
"completions/mean_terminated_length": 702.2991943359375,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.34569218458255535,
"grad_norm": 0.16774458832209882,
"learning_rate": 8.34565303179429e-07,
"loss": 0.008,
"num_tokens": 16433688.0,
"reward": 0.734375,
"reward_std": 0.21397608518600464,
"rewards/accuracy_reward/mean": 0.734375,
"rewards/accuracy_reward/std": 0.4425306022167206,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1999.0,
"completions/mean_length": 732.46484375,
"completions/mean_terminated_length": 706.2589721679688,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.3499599893304881,
"grad_norm": 0.1747412031038752,
"learning_rate": 8.289693629698563e-07,
"loss": 0.0117,
"num_tokens": 16654767.0,
"reward": 0.6328125,
"reward_std": 0.25316160917282104,
"rewards/accuracy_reward/mean": 0.6328125,
"rewards/accuracy_reward/std": 0.48298248648643494,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1858.0,
"completions/mean_length": 788.609375,
"completions/mean_terminated_length": 753.2047729492188,
"completions/min_length": 282.0,
"completions/min_terminated_length": 282.0,
"epoch": 0.3542277940784209,
"grad_norm": 0.15522696483921386,
"learning_rate": 8.232998006078997e-07,
"loss": 0.0202,
"num_tokens": 16894779.0,
"reward": 0.55078125,
"reward_std": 0.22396378219127655,
"rewards/accuracy_reward/mean": 0.55078125,
"rewards/accuracy_reward/std": 0.49838894605636597,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1696.0,
"completions/max_terminated_length": 1696.0,
"completions/mean_length": 616.984375,
"completions/mean_terminated_length": 616.984375,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.3584955988263537,
"grad_norm": 0.17841302636273246,
"learning_rate": 8.175578849210894e-07,
"loss": 0.0247,
"num_tokens": 17084591.0,
"reward": 0.69140625,
"reward_std": 0.22423216700553894,
"rewards/accuracy_reward/mean": 0.69140625,
"rewards/accuracy_reward/std": 0.46281787753105164,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2017.0,
"completions/mean_length": 627.47265625,
"completions/mean_terminated_length": 610.6284790039062,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.3627634035742865,
"grad_norm": 0.17841638110639277,
"learning_rate": 8.117449009293668e-07,
"loss": 0.0174,
"num_tokens": 17278168.0,
"reward": 0.6328125,
"reward_std": 0.2315973937511444,
"rewards/accuracy_reward/mean": 0.6328125,
"rewards/accuracy_reward/std": 0.48298248648643494,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1687.0,
"completions/mean_length": 642.0859375,
"completions/mean_terminated_length": 619.7698974609375,
"completions/min_length": 211.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.36703120832221925,
"grad_norm": 0.16779939521060505,
"learning_rate": 8.058621495575031e-07,
"loss": 0.0275,
"num_tokens": 17482222.0,
"reward": 0.69921875,
"reward_std": 0.229889914393425,
"rewards/accuracy_reward/mean": 0.69921875,
"rewards/accuracy_reward/std": 0.45949608087539673,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1946.0,
"completions/mean_length": 696.1015625,
"completions/mean_terminated_length": 680.0711669921875,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.37129901307015206,
"grad_norm": 0.15547052401529285,
"learning_rate": 7.999109473439569e-07,
"loss": -0.0119,
"num_tokens": 17699168.0,
"reward": 0.57421875,
"reward_std": 0.22594210505485535,
"rewards/accuracy_reward/mean": 0.57421875,
"rewards/accuracy_reward/std": 0.49542948603630066,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1843.0,
"completions/mean_length": 672.66796875,
"completions/mean_terminated_length": 634.0039672851562,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.3755668178180848,
"grad_norm": 0.2075832015926432,
"learning_rate": 7.938926261462365e-07,
"loss": 0.0422,
"num_tokens": 17906067.0,
"reward": 0.6640625,
"reward_std": 0.2511882185935974,
"rewards/accuracy_reward/mean": 0.6640625,
"rewards/accuracy_reward/std": 0.4732423722743988,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1788.0,
"completions/mean_length": 706.59375,
"completions/mean_terminated_length": 674.4000244140625,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"epoch": 0.37983462256601763,
"grad_norm": 0.17450668008186176,
"learning_rate": 7.878085328428368e-07,
"loss": 0.0417,
"num_tokens": 18123947.0,
"reward": 0.70703125,
"reward_std": 0.20581842958927155,
"rewards/accuracy_reward/mean": 0.70703125,
"rewards/accuracy_reward/std": 0.45601576566696167,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1865.0,
"completions/max_terminated_length": 1865.0,
"completions/mean_length": 615.2890625,
"completions/mean_terminated_length": 615.2890625,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.3841024273139504,
"grad_norm": 0.29433828050165617,
"learning_rate": 7.81660029031811e-07,
"loss": 0.0251,
"num_tokens": 18313293.0,
"reward": 0.703125,
"reward_std": 0.2618584930896759,
"rewards/accuracy_reward/mean": 0.703125,
"rewards/accuracy_reward/std": 0.45777595043182373,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1871.0,
"completions/mean_length": 674.09375,
"completions/mean_terminated_length": 641.1200561523438,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.38837023206188315,
"grad_norm": 0.1524263582318208,
"learning_rate": 7.754484907260512e-07,
"loss": 0.0346,
"num_tokens": 18520901.0,
"reward": 0.671875,
"reward_std": 0.14203590154647827,
"rewards/accuracy_reward/mean": 0.671875,
"rewards/accuracy_reward/std": 0.47045037150382996,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 868.55859375,
"completions/mean_terminated_length": 825.5830078125,
"completions/min_length": 254.0,
"completions/min_terminated_length": 254.0,
"epoch": 0.39263803680981596,
"grad_norm": 0.16545970246243014,
"learning_rate": 7.691753080453411e-07,
"loss": 0.0313,
"num_tokens": 18778724.0,
"reward": 0.484375,
"reward_std": 0.25209686160087585,
"rewards/accuracy_reward/mean": 0.484375,
"rewards/accuracy_reward/std": 0.5007347464561462,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1778.0,
"completions/mean_length": 677.42578125,
"completions/mean_terminated_length": 672.051025390625,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"epoch": 0.3969058415577487,
"grad_norm": 0.2318783127386051,
"learning_rate": 7.628418849052523e-07,
"loss": 0.0072,
"num_tokens": 18993529.0,
"reward": 0.62109375,
"reward_std": 0.2673616111278534,
"rewards/accuracy_reward/mean": 0.62109375,
"rewards/accuracy_reward/std": 0.4860650300979614,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1946.0,
"completions/mean_length": 761.3125,
"completions/mean_terminated_length": 756.2667236328125,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.40117364630568153,
"grad_norm": 0.17729523898962476,
"learning_rate": 7.564496387029531e-07,
"loss": 0.0183,
"num_tokens": 19224033.0,
"reward": 0.5390625,
"reward_std": 0.22370177507400513,
"rewards/accuracy_reward/mean": 0.5390625,
"rewards/accuracy_reward/std": 0.4994482398033142,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1737.0,
"completions/mean_length": 733.1484375,
"completions/mean_terminated_length": 706.9561767578125,
"completions/min_length": 201.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.4054414510536143,
"grad_norm": 0.17772837137802816,
"learning_rate": 7.5e-07,
"loss": 0.006,
"num_tokens": 19447919.0,
"reward": 0.5703125,
"reward_std": 0.22396133840084076,
"rewards/accuracy_reward/mean": 0.5703125,
"rewards/accuracy_reward/std": 0.4960011839866638,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1869.0,
"completions/mean_length": 630.4609375,
"completions/mean_terminated_length": 590.6104125976562,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.4097092558015471,
"grad_norm": 0.1765422148392705,
"learning_rate": 7.434944122021836e-07,
"loss": 0.0296,
"num_tokens": 19646709.0,
"reward": 0.67578125,
"reward_std": 0.2021351009607315,
"rewards/accuracy_reward/mean": 0.67578125,
"rewards/accuracy_reward/std": 0.46899911761283875,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1887.0,
"completions/mean_length": 709.61328125,
"completions/mean_terminated_length": 699.0748291015625,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.41397706054947986,
"grad_norm": 1.043131061244302,
"learning_rate": 7.369343312364993e-07,
"loss": 0.025,
"num_tokens": 19864034.0,
"reward": 0.66015625,
"reward_std": 0.19712254405021667,
"rewards/accuracy_reward/mean": 0.66015625,
"rewards/accuracy_reward/std": 0.47458380460739136,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1589.0,
"completions/mean_length": 584.66796875,
"completions/mean_terminated_length": 555.5179443359375,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.4182448652974126,
"grad_norm": 0.20960134937787364,
"learning_rate": 7.303212252253161e-07,
"loss": 0.0423,
"num_tokens": 20045565.0,
"reward": 0.7265625,
"reward_std": 0.1910865604877472,
"rewards/accuracy_reward/mean": 0.7265625,
"rewards/accuracy_reward/std": 0.446596622467041,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 804.58203125,
"completions/mean_terminated_length": 759.2753295898438,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.42251267004534543,
"grad_norm": 0.18572317777841613,
"learning_rate": 7.236565741578162e-07,
"loss": 0.0184,
"num_tokens": 20290610.0,
"reward": 0.46484375,
"reward_std": 0.2952326536178589,
"rewards/accuracy_reward/mean": 0.46484375,
"rewards/accuracy_reward/std": 0.49973952770233154,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1751.0,
"completions/max_terminated_length": 1751.0,
"completions/mean_length": 713.328125,
"completions/mean_terminated_length": 713.328125,
"completions/min_length": 259.0,
"completions/min_terminated_length": 259.0,
"epoch": 0.4267804747932782,
"grad_norm": 0.21936154622420784,
"learning_rate": 7.16941869558779e-07,
"loss": 0.029,
"num_tokens": 20509950.0,
"reward": 0.62890625,
"reward_std": 0.30904948711395264,
"rewards/accuracy_reward/mean": 0.62890625,
"rewards/accuracy_reward/std": 0.48404383659362793,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1991.0,
"completions/mean_length": 722.703125,
"completions/mean_terminated_length": 717.5059204101562,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.431048279541211,
"grad_norm": 0.15759949457471728,
"learning_rate": 7.101786141547828e-07,
"loss": 0.0209,
"num_tokens": 20743090.0,
"reward": 0.6015625,
"reward_std": 0.2474999576807022,
"rewards/accuracy_reward/mean": 0.6015625,
"rewards/accuracy_reward/std": 0.4905354380607605,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 703.4140625,
"completions/mean_terminated_length": 660.040283203125,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.43531608428914376,
"grad_norm": 0.20631003127294775,
"learning_rate": 7.033683215379002e-07,
"loss": 0.0208,
"num_tokens": 20962836.0,
"reward": 0.61328125,
"reward_std": 0.24777080118656158,
"rewards/accuracy_reward/mean": 0.61328125,
"rewards/accuracy_reward/std": 0.4879522919654846,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1695.0,
"completions/mean_length": 676.51953125,
"completions/mean_terminated_length": 660.2569580078125,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.43958388903707657,
"grad_norm": 0.18277993766193618,
"learning_rate": 6.965125158269618e-07,
"loss": 0.0165,
"num_tokens": 21181369.0,
"reward": 0.6015625,
"reward_std": 0.21686306595802307,
"rewards/accuracy_reward/mean": 0.6015625,
"rewards/accuracy_reward/std": 0.4905354380607605,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1986.0,
"completions/mean_length": 749.9375,
"completions/mean_terminated_length": 734.5454711914062,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.4438516937850093,
"grad_norm": 0.14768169324356623,
"learning_rate": 6.896127313264642e-07,
"loss": 0.0037,
"num_tokens": 21407257.0,
"reward": 0.6328125,
"reward_std": 0.17267769575119019,
"rewards/accuracy_reward/mean": 0.6328125,
"rewards/accuracy_reward/std": 0.48298248648643494,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1753.0,
"completions/mean_length": 730.5625,
"completions/mean_terminated_length": 714.9407348632812,
"completions/min_length": 211.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.44811949853294214,
"grad_norm": 0.1714711197886987,
"learning_rate": 6.826705121831976e-07,
"loss": 0.0171,
"num_tokens": 21628641.0,
"reward": 0.6953125,
"reward_std": 0.191619411110878,
"rewards/accuracy_reward/mean": 0.6953125,
"rewards/accuracy_reward/std": 0.4611765742301941,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1905.0,
"completions/mean_length": 670.27734375,
"completions/mean_terminated_length": 648.4087524414062,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.4523873032808749,
"grad_norm": 0.17919870616850794,
"learning_rate": 6.756874120406714e-07,
"loss": 0.0125,
"num_tokens": 21841048.0,
"reward": 0.56640625,
"reward_std": 0.16978827118873596,
"rewards/accuracy_reward/mean": 0.56640625,
"rewards/accuracy_reward/std": 0.4965413510799408,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1974.0,
"completions/mean_length": 728.2578125,
"completions/mean_terminated_length": 707.3095703125,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.45665510802880765,
"grad_norm": 0.1451932728689885,
"learning_rate": 6.68664993691415e-07,
"loss": 0.0032,
"num_tokens": 22064930.0,
"reward": 0.65234375,
"reward_std": 0.17293481528759003,
"rewards/accuracy_reward/mean": 0.65234375,
"rewards/accuracy_reward/std": 0.4771590530872345,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 761.76953125,
"completions/mean_terminated_length": 720.2781982421875,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.46092291277674047,
"grad_norm": 0.15406274620167626,
"learning_rate": 6.6160482872723e-07,
"loss": 0.0406,
"num_tokens": 22291983.0,
"reward": 0.5703125,
"reward_std": 0.2566937804222107,
"rewards/accuracy_reward/mean": 0.5703125,
"rewards/accuracy_reward/std": 0.4960011839866638,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 666.41015625,
"completions/mean_terminated_length": 633.2520141601562,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.4651907175246732,
"grad_norm": 0.18595861629610597,
"learning_rate": 6.545084971874736e-07,
"loss": 0.0125,
"num_tokens": 22504056.0,
"reward": 0.5390625,
"reward_std": 0.2210792601108551,
"rewards/accuracy_reward/mean": 0.5390625,
"rewards/accuracy_reward/std": 0.4994482398033142,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1787.0,
"completions/mean_length": 599.96484375,
"completions/mean_terminated_length": 588.56298828125,
"completions/min_length": 176.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.46945852227260604,
"grad_norm": 0.16488321774504297,
"learning_rate": 6.473775872054521e-07,
"loss": 0.0011,
"num_tokens": 22688183.0,
"reward": 0.7265625,
"reward_std": 0.20582087337970734,
"rewards/accuracy_reward/mean": 0.7265625,
"rewards/accuracy_reward/std": 0.446596622467041,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1824.0,
"completions/mean_length": 661.24609375,
"completions/mean_terminated_length": 644.8023681640625,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.4737263270205388,
"grad_norm": 0.19850765821238375,
"learning_rate": 6.402136946530014e-07,
"loss": 0.018,
"num_tokens": 22888486.0,
"reward": 0.60546875,
"reward_std": 0.2784126102924347,
"rewards/accuracy_reward/mean": 0.60546875,
"rewards/accuracy_reward/std": 0.48970720171928406,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2046.0,
"completions/mean_length": 667.91015625,
"completions/mean_terminated_length": 629.1124267578125,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.4779941317684716,
"grad_norm": 0.20616587813684115,
"learning_rate": 6.330184227833375e-07,
"loss": 0.0412,
"num_tokens": 23098319.0,
"reward": 0.6953125,
"reward_std": 0.29314449429512024,
"rewards/accuracy_reward/mean": 0.6953125,
"rewards/accuracy_reward/std": 0.4611765742301941,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1913.0,
"completions/mean_length": 831.84765625,
"completions/mean_terminated_length": 802.6600341796875,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.48226193651640437,
"grad_norm": 0.17727191227994013,
"learning_rate": 6.257933818722542e-07,
"loss": 0.0011,
"num_tokens": 23354552.0,
"reward": 0.51953125,
"reward_std": 0.2792089581489563,
"rewards/accuracy_reward/mean": 0.51953125,
"rewards/accuracy_reward/std": 0.5005971193313599,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1881.0,
"completions/mean_length": 626.0,
"completions/mean_terminated_length": 591.8720092773438,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.4865297412643372,
"grad_norm": 0.2117873494491709,
"learning_rate": 6.185401888577487e-07,
"loss": 0.0277,
"num_tokens": 23548552.0,
"reward": 0.6171875,
"reward_std": 0.2188364714384079,
"rewards/accuracy_reward/mean": 0.6171875,
"rewards/accuracy_reward/std": 0.48702529072761536,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1887.0,
"completions/mean_length": 807.8828125,
"completions/mean_terminated_length": 762.6963500976562,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.49079754601226994,
"grad_norm": 0.1378906526266812,
"learning_rate": 6.112604669781572e-07,
"loss": 0.0253,
"num_tokens": 23800194.0,
"reward": 0.6796875,
"reward_std": 0.2197500467300415,
"rewards/accuracy_reward/mean": 0.6796875,
"rewards/accuracy_reward/std": 0.4675106406211853,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1927.0,
"completions/mean_length": 755.46875,
"completions/mean_terminated_length": 740.142333984375,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.4950653507602027,
"grad_norm": 0.17213466615909223,
"learning_rate": 6.039558454088795e-07,
"loss": 0.0048,
"num_tokens": 24030330.0,
"reward": 0.63671875,
"reward_std": 0.22305512428283691,
"rewards/accuracy_reward/mean": 0.63671875,
"rewards/accuracy_reward/std": 0.48188701272010803,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 656.859375,
"completions/mean_terminated_length": 623.4720458984375,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"epoch": 0.4993331555081355,
"grad_norm": 0.1892514134210408,
"learning_rate": 5.966279588977766e-07,
"loss": 0.0249,
"num_tokens": 24234654.0,
"reward": 0.640625,
"reward_std": 0.2301558405160904,
"rewards/accuracy_reward/mean": 0.640625,
"rewards/accuracy_reward/std": 0.4807571768760681,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1889.0,
"completions/mean_length": 691.0859375,
"completions/mean_terminated_length": 680.4015502929688,
"completions/min_length": 172.0,
"completions/min_terminated_length": 172.0,
"epoch": 0.5036009602560683,
"grad_norm": 0.17532242354264443,
"learning_rate": 5.892784473993183e-07,
"loss": 0.0122,
"num_tokens": 24449412.0,
"reward": 0.58984375,
"reward_std": 0.21436314284801483,
"rewards/accuracy_reward/mean": 0.58984375,
"rewards/accuracy_reward/std": 0.49282538890838623,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 727.23828125,
"completions/mean_terminated_length": 716.8385620117188,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.507868765004001,
"grad_norm": 0.15731128075810266,
"learning_rate": 5.819089557075688e-07,
"loss": 0.0053,
"num_tokens": 24674217.0,
"reward": 0.5859375,
"reward_std": 0.23079612851142883,
"rewards/accuracy_reward/mean": 0.5859375,
"rewards/accuracy_reward/std": 0.4935242533683777,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 675.8671875,
"completions/mean_terminated_length": 654.0873413085938,
"completions/min_length": 108.0,
"completions/min_terminated_length": 108.0,
"epoch": 0.5121365697519339,
"grad_norm": 0.16762770464548887,
"learning_rate": 5.745211330880872e-07,
"loss": 0.0053,
"num_tokens": 24883719.0,
"reward": 0.6015625,
"reward_std": 0.19438527524471283,
"rewards/accuracy_reward/mean": 0.6015625,
"rewards/accuracy_reward/std": 0.4905354380607605,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1897.0,
"completions/mean_length": 688.73828125,
"completions/mean_terminated_length": 678.035400390625,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.5164043744998666,
"grad_norm": 0.14032321277110027,
"learning_rate": 5.671166329088277e-07,
"loss": 0.0173,
"num_tokens": 25092724.0,
"reward": 0.6015625,
"reward_std": 0.17833054065704346,
"rewards/accuracy_reward/mean": 0.6015625,
"rewards/accuracy_reward/std": 0.4905354380607605,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1918.0,
"completions/mean_length": 623.93359375,
"completions/mean_terminated_length": 607.0474853515625,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.5206721792477994,
"grad_norm": 0.12405235765467854,
"learning_rate": 5.596971122701221e-07,
"loss": 0.0082,
"num_tokens": 25288411.0,
"reward": 0.76171875,
"reward_std": 0.1633341908454895,
"rewards/accuracy_reward/mean": 0.76171875,
"rewards/accuracy_reward/std": 0.4268665909767151,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1963.0,
"completions/mean_length": 650.7734375,
"completions/mean_terminated_length": 628.5952758789062,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.5249399839957322,
"grad_norm": 0.15044450960162567,
"learning_rate": 5.522642316338268e-07,
"loss": 0.0022,
"num_tokens": 25488977.0,
"reward": 0.640625,
"reward_std": 0.15190494060516357,
"rewards/accuracy_reward/mean": 0.640625,
"rewards/accuracy_reward/std": 0.4807571768760681,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 706.6328125,
"completions/mean_terminated_length": 685.34130859375,
"completions/min_length": 158.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.529207788743665,
"grad_norm": 0.21210650194517838,
"learning_rate": 5.448196544517167e-07,
"loss": 0.0254,
"num_tokens": 25707379.0,
"reward": 0.578125,
"reward_std": 0.2531664967536926,
"rewards/accuracy_reward/mean": 0.578125,
"rewards/accuracy_reward/std": 0.49482619762420654,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1961.0,
"completions/mean_length": 797.671875,
"completions/mean_terminated_length": 746.8455200195312,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.5334755934915978,
"grad_norm": 0.1882804309255771,
"learning_rate": 5.373650467932121e-07,
"loss": 0.0148,
"num_tokens": 25947999.0,
"reward": 0.54296875,
"reward_std": 0.26158764958381653,
"rewards/accuracy_reward/mean": 0.54296875,
"rewards/accuracy_reward/std": 0.4991260766983032,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1695.0,
"completions/mean_length": 691.6875,
"completions/mean_terminated_length": 659.1360473632812,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.5377433982395305,
"grad_norm": 0.1922973128812697,
"learning_rate": 5.299020769725171e-07,
"loss": -0.0067,
"num_tokens": 26169399.0,
"reward": 0.59765625,
"reward_std": 0.25118574500083923,
"rewards/accuracy_reward/mean": 0.59765625,
"rewards/accuracy_reward/std": 0.4913311004638672,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 620.89453125,
"completions/mean_terminated_length": 609.657470703125,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"epoch": 0.5420112029874633,
"grad_norm": 0.11202992839540406,
"learning_rate": 5.224324151752575e-07,
"loss": -0.0027,
"num_tokens": 26363860.0,
"reward": 0.6953125,
"reward_std": 0.14716076850891113,
"rewards/accuracy_reward/mean": 0.6953125,
"rewards/accuracy_reward/std": 0.4611765742301941,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1596.0,
"completions/mean_length": 791.625,
"completions/mean_terminated_length": 766.59765625,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"epoch": 0.5462790077353961,
"grad_norm": 0.15263335447293108,
"learning_rate": 5.149577330846992e-07,
"loss": 0.0263,
"num_tokens": 26603028.0,
"reward": 0.578125,
"reward_std": 0.17662060260772705,
"rewards/accuracy_reward/mean": 0.5967742204666138,
"rewards/accuracy_reward/std": 0.4915373921394348,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 823.96875,
"completions/mean_terminated_length": 794.592041015625,
"completions/min_length": 153.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.5505468124833289,
"grad_norm": 0.15455545451406247,
"learning_rate": 5.074797035076318e-07,
"loss": 0.0167,
"num_tokens": 26849420.0,
"reward": 0.55859375,
"reward_std": 0.2684224247932434,
"rewards/accuracy_reward/mean": 0.55859375,
"rewards/accuracy_reward/std": 0.4975275993347168,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 814.3046875,
"completions/mean_terminated_length": 784.696044921875,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.5548146172312617,
"grad_norm": 0.1451854352321105,
"learning_rate": 5e-07,
"loss": -0.0009,
"num_tokens": 27101234.0,
"reward": 0.46484375,
"reward_std": 0.17768390476703644,
"rewards/accuracy_reward/mean": 0.46484375,
"rewards/accuracy_reward/std": 0.49973952770233154,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1944.0,
"completions/mean_length": 652.73046875,
"completions/mean_terminated_length": 624.936279296875,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.5590824219791944,
"grad_norm": 0.16448043370772392,
"learning_rate": 4.925202964923683e-07,
"loss": 0.0494,
"num_tokens": 27307317.0,
"reward": 0.734375,
"reward_std": 0.17688506841659546,
"rewards/accuracy_reward/mean": 0.734375,
"rewards/accuracy_reward/std": 0.4425306022167206,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.0,
"completions/mean_length": 799.171875,
"completions/mean_terminated_length": 753.6680297851562,
"completions/min_length": 103.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.5633502267271272,
"grad_norm": 0.15215520006680078,
"learning_rate": 4.850422669153009e-07,
"loss": 0.0332,
"num_tokens": 27550865.0,
"reward": 0.6015625,
"reward_std": 0.25289812684059143,
"rewards/accuracy_reward/mean": 0.6015625,
"rewards/accuracy_reward/std": 0.4905354380607605,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1846.0,
"completions/mean_length": 671.6640625,
"completions/mean_terminated_length": 638.6320190429688,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.5676180314750601,
"grad_norm": 0.17982488060357216,
"learning_rate": 4.775675848247427e-07,
"loss": 0.0252,
"num_tokens": 27755795.0,
"reward": 0.69921875,
"reward_std": 0.24670997262001038,
"rewards/accuracy_reward/mean": 0.69921875,
"rewards/accuracy_reward/std": 0.45949608087539673,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 727.27734375,
"completions/mean_terminated_length": 695.5800170898438,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.5718858362229928,
"grad_norm": 0.18005160823929492,
"learning_rate": 4.700979230274829e-07,
"loss": 0.0189,
"num_tokens": 27983106.0,
"reward": 0.55078125,
"reward_std": 0.20384502410888672,
"rewards/accuracy_reward/mean": 0.55078125,
"rewards/accuracy_reward/std": 0.49838894605636597,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1807.0,
"completions/mean_length": 689.609375,
"completions/mean_terminated_length": 662.5498046875,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.5761536409709256,
"grad_norm": 0.14230587148619128,
"learning_rate": 4.626349532067879e-07,
"loss": 0.0063,
"num_tokens": 28208398.0,
"reward": 0.5859375,
"reward_std": 0.19503436982631683,
"rewards/accuracy_reward/mean": 0.5859375,
"rewards/accuracy_reward/std": 0.4935242533683777,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1827.0,
"completions/mean_length": 699.90234375,
"completions/mean_terminated_length": 678.5040283203125,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.5804214457188583,
"grad_norm": 0.20157881415963988,
"learning_rate": 4.5518034554828327e-07,
"loss": 0.0413,
"num_tokens": 28418677.0,
"reward": 0.60546875,
"reward_std": 0.26501142978668213,
"rewards/accuracy_reward/mean": 0.60546875,
"rewards/accuracy_reward/std": 0.48970720171928406,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1706.0,
"completions/mean_length": 723.7890625,
"completions/mean_terminated_length": 702.7698974609375,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.5846892504667911,
"grad_norm": 0.16773141819498977,
"learning_rate": 4.477357683661733e-07,
"loss": 0.0307,
"num_tokens": 28645927.0,
"reward": 0.53515625,
"reward_std": 0.21238481998443604,
"rewards/accuracy_reward/mean": 0.53515625,
"rewards/accuracy_reward/std": 0.49973952770233154,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1852.0,
"completions/mean_length": 727.3828125,
"completions/mean_terminated_length": 701.0757446289062,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.588957055214724,
"grad_norm": 0.17450962048980123,
"learning_rate": 4.403028877298779e-07,
"loss": 0.041,
"num_tokens": 28871649.0,
"reward": 0.51953125,
"reward_std": 0.23330485820770264,
"rewards/accuracy_reward/mean": 0.51953125,
"rewards/accuracy_reward/std": 0.5005971193313599,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 762.8828125,
"completions/mean_terminated_length": 716.0567016601562,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"epoch": 0.5932248599626567,
"grad_norm": 0.1457325565330749,
"learning_rate": 4.328833670911724e-07,
"loss": 0.0343,
"num_tokens": 29103011.0,
"reward": 0.515625,
"reward_std": 0.19503435492515564,
"rewards/accuracy_reward/mean": 0.5322580933570862,
"rewards/accuracy_reward/std": 0.4999673366546631,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1941.0,
"completions/mean_length": 738.21875,
"completions/mean_terminated_length": 695.9677124023438,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.5974926647105895,
"grad_norm": 0.16245529520864127,
"learning_rate": 4.254788669119127e-07,
"loss": 0.0494,
"num_tokens": 29329851.0,
"reward": 0.6171875,
"reward_std": 0.25658145546913147,
"rewards/accuracy_reward/mean": 0.6171875,
"rewards/accuracy_reward/std": 0.48702529072761536,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1815.0,
"completions/max_terminated_length": 1815.0,
"completions/mean_length": 723.453125,
"completions/mean_terminated_length": 723.453125,
"completions/min_length": 248.0,
"completions/min_terminated_length": 248.0,
"epoch": 0.6017604694585222,
"grad_norm": 0.15294716618010246,
"learning_rate": 4.180910442924311e-07,
"loss": 0.0056,
"num_tokens": 29552751.0,
"reward": 0.61328125,
"reward_std": 0.2106798142194748,
"rewards/accuracy_reward/mean": 0.61328125,
"rewards/accuracy_reward/std": 0.4879522919654846,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 717.20703125,
"completions/mean_terminated_length": 690.6972045898438,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.6060282742064551,
"grad_norm": 0.16792896574939836,
"learning_rate": 4.107215526006817e-07,
"loss": 0.021,
"num_tokens": 29769820.0,
"reward": 0.5078125,
"reward_std": 0.2486819177865982,
"rewards/accuracy_reward/mean": 0.5078125,
"rewards/accuracy_reward/std": 0.5009182691574097,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 682.1875,
"completions/mean_terminated_length": 671.4330444335938,
"completions/min_length": 210.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.6102960789543879,
"grad_norm": 0.150865313084476,
"learning_rate": 4.0337204110222347e-07,
"loss": 0.0116,
"num_tokens": 29980868.0,
"reward": 0.6953125,
"reward_std": 0.18884865939617157,
"rewards/accuracy_reward/mean": 0.6953125,
"rewards/accuracy_reward/std": 0.4611765742301941,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 685.66015625,
"completions/mean_terminated_length": 664.0357666015625,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"epoch": 0.6145638837023206,
"grad_norm": 0.1953752730366482,
"learning_rate": 3.960441545911204e-07,
"loss": 0.0241,
"num_tokens": 30194781.0,
"reward": 0.609375,
"reward_std": 0.226848304271698,
"rewards/accuracy_reward/mean": 0.609375,
"rewards/accuracy_reward/std": 0.48884621262550354,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1773.0,
"completions/mean_length": 713.00390625,
"completions/mean_terminated_length": 691.8135375976562,
"completions/min_length": 84.0,
"completions/min_terminated_length": 84.0,
"epoch": 0.6188316884502534,
"grad_norm": 0.15612651886441767,
"learning_rate": 3.8873953302184283e-07,
"loss": 0.0161,
"num_tokens": 30419150.0,
"reward": 0.50390625,
"reward_std": 0.1810988485813141,
"rewards/accuracy_reward/mean": 0.50390625,
"rewards/accuracy_reward/std": 0.5009641647338867,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1926.0,
"completions/mean_length": 696.49609375,
"completions/mean_terminated_length": 685.8543090820312,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.6230994931981861,
"grad_norm": 0.14637261498886597,
"learning_rate": 3.814598111422513e-07,
"loss": 0.024,
"num_tokens": 30635453.0,
"reward": 0.65625,
"reward_std": 0.20357908308506012,
"rewards/accuracy_reward/mean": 0.65625,
"rewards/accuracy_reward/std": 0.47588926553726196,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 759.41796875,
"completions/mean_terminated_length": 701.563232421875,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"epoch": 0.627367297946119,
"grad_norm": 0.20463532100644813,
"learning_rate": 3.742066181277457e-07,
"loss": 0.0394,
"num_tokens": 30872104.0,
"reward": 0.51953125,
"reward_std": 0.28117847442626953,
"rewards/accuracy_reward/mean": 0.51953125,
"rewards/accuracy_reward/std": 0.5005971193313599,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1898.0,
"completions/mean_length": 789.453125,
"completions/mean_terminated_length": 754.072265625,
"completions/min_length": 279.0,
"completions/min_terminated_length": 279.0,
"epoch": 0.6316351026940518,
"grad_norm": 0.15130445176879567,
"learning_rate": 3.669815772166625e-07,
"loss": 0.0299,
"num_tokens": 31115084.0,
"reward": 0.48828125,
"reward_std": 0.2386942058801651,
"rewards/accuracy_reward/mean": 0.48828125,
"rewards/accuracy_reward/std": 0.5008418560028076,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1673.0,
"completions/mean_length": 695.59375,
"completions/mean_terminated_length": 684.9448852539062,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.6359029074419845,
"grad_norm": 0.17477379892719042,
"learning_rate": 3.5978630534699865e-07,
"loss": 0.0265,
"num_tokens": 31332292.0,
"reward": 0.5625,
"reward_std": 0.25658145546913147,
"rewards/accuracy_reward/mean": 0.5625,
"rewards/accuracy_reward/std": 0.49705013632774353,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1865.0,
"completions/mean_length": 712.81640625,
"completions/mean_terminated_length": 686.2191162109375,
"completions/min_length": 198.0,
"completions/min_terminated_length": 198.0,
"epoch": 0.6401707121899173,
"grad_norm": 0.18543376646343887,
"learning_rate": 3.526224127945478e-07,
"loss": 0.0215,
"num_tokens": 31550877.0,
"reward": 0.640625,
"reward_std": 0.25487154722213745,
"rewards/accuracy_reward/mean": 0.640625,
"rewards/accuracy_reward/std": 0.4807571768760681,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1945.0,
"completions/mean_length": 604.0859375,
"completions/mean_terminated_length": 592.716552734375,
"completions/min_length": 154.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.6444385169378501,
"grad_norm": 0.18278541964037912,
"learning_rate": 3.454915028125263e-07,
"loss": 0.0299,
"num_tokens": 31745163.0,
"reward": 0.73046875,
"reward_std": 0.2293570339679718,
"rewards/accuracy_reward/mean": 0.73046875,
"rewards/accuracy_reward/std": 0.44458550214767456,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1759.0,
"completions/max_terminated_length": 1759.0,
"completions/mean_length": 649.3671875,
"completions/mean_terminated_length": 649.3671875,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.6487063216857829,
"grad_norm": 2.653852717153093,
"learning_rate": 3.3839517127277004e-07,
"loss": 0.0311,
"num_tokens": 31947089.0,
"reward": 0.6171875,
"reward_std": 0.2026655077934265,
"rewards/accuracy_reward/mean": 0.6171875,
"rewards/accuracy_reward/std": 0.48702529072761536,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1766.0,
"completions/mean_length": 692.83203125,
"completions/mean_terminated_length": 631.9877319335938,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.6529741264337157,
"grad_norm": 0.14274698223421514,
"learning_rate": 3.31335006308585e-07,
"loss": 0.0308,
"num_tokens": 32173766.0,
"reward": 0.57421875,
"reward_std": 0.14058801531791687,
"rewards/accuracy_reward/mean": 0.57421875,
"rewards/accuracy_reward/std": 0.49542948603630066,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 711.15625,
"completions/mean_terminated_length": 651.1346435546875,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.6572419311816484,
"grad_norm": 0.13873122233103782,
"learning_rate": 3.243125879593286e-07,
"loss": 0.0217,
"num_tokens": 32395510.0,
"reward": 0.64453125,
"reward_std": 0.14992907643318176,
"rewards/accuracy_reward/mean": 0.64453125,
"rewards/accuracy_reward/std": 0.4795927405357361,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 670.67578125,
"completions/mean_terminated_length": 659.8306884765625,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.6615097359295812,
"grad_norm": 0.16194091457756485,
"learning_rate": 3.173294878168025e-07,
"loss": 0.02,
"num_tokens": 32603435.0,
"reward": 0.6875,
"reward_std": 0.22738119959831238,
"rewards/accuracy_reward/mean": 0.6875,
"rewards/accuracy_reward/std": 0.4644203782081604,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 668.96875,
"completions/mean_terminated_length": 641.498046875,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.665777540677514,
"grad_norm": 0.20775701511344793,
"learning_rate": 3.1038726867353583e-07,
"loss": 0.0147,
"num_tokens": 32815211.0,
"reward": 0.609375,
"reward_std": 0.24382543563842773,
"rewards/accuracy_reward/mean": 0.609375,
"rewards/accuracy_reward/std": 0.48884621262550354,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 738.48828125,
"completions/mean_terminated_length": 733.3529663085938,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.6700453454254468,
"grad_norm": 0.16412376814421997,
"learning_rate": 3.034874841730382e-07,
"loss": 0.0326,
"num_tokens": 33039328.0,
"reward": 0.61328125,
"reward_std": 0.19727861881256104,
"rewards/accuracy_reward/mean": 0.61328125,
"rewards/accuracy_reward/std": 0.4879522919654846,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 663.28515625,
"completions/mean_terminated_length": 618.616943359375,
"completions/min_length": 160.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.6743131501733796,
"grad_norm": 0.13990368182253585,
"learning_rate": 2.9663167846209996e-07,
"loss": 0.0185,
"num_tokens": 33246481.0,
"reward": 0.62109375,
"reward_std": 0.16807834804058075,
"rewards/accuracy_reward/mean": 0.62109375,
"rewards/accuracy_reward/std": 0.4860650300979614,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1982.0,
"completions/mean_length": 720.7421875,
"completions/mean_terminated_length": 705.0039672851562,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"epoch": 0.6785809549213123,
"grad_norm": 0.21349501848091973,
"learning_rate": 2.898213858452173e-07,
"loss": 0.0235,
"num_tokens": 33476039.0,
"reward": 0.56640625,
"reward_std": 0.30327552556991577,
"rewards/accuracy_reward/mean": 0.56640625,
"rewards/accuracy_reward/std": 0.4965413510799408,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 698.4375,
"completions/mean_terminated_length": 666.0480346679688,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.6828487596692452,
"grad_norm": 2.7101145922860264,
"learning_rate": 2.8305813044122093e-07,
"loss": 0.0384,
"num_tokens": 33691495.0,
"reward": 0.640625,
"reward_std": 0.18623006343841553,
"rewards/accuracy_reward/mean": 0.640625,
"rewards/accuracy_reward/std": 0.4807571768760681,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 695.64453125,
"completions/mean_terminated_length": 684.9960327148438,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"epoch": 0.6871165644171779,
"grad_norm": 0.19246224673247156,
"learning_rate": 2.763434258421836e-07,
"loss": 0.0258,
"num_tokens": 33903004.0,
"reward": 0.63671875,
"reward_std": 0.2394905686378479,
"rewards/accuracy_reward/mean": 0.63671875,
"rewards/accuracy_reward/std": 0.48188701272010803,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1826.0,
"completions/mean_length": 787.1015625,
"completions/mean_terminated_length": 772.1502075195312,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.6913843691651107,
"grad_norm": 0.17498122004000508,
"learning_rate": 2.696787747746839e-07,
"loss": 0.0354,
"num_tokens": 34144806.0,
"reward": 0.5078125,
"reward_std": 0.2318657785654068,
"rewards/accuracy_reward/mean": 0.5078125,
"rewards/accuracy_reward/std": 0.5009182691574097,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 691.89453125,
"completions/mean_terminated_length": 642.4818115234375,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.6956521739130435,
"grad_norm": 0.15967496784346374,
"learning_rate": 2.6306566876350067e-07,
"loss": 0.0105,
"num_tokens": 34367667.0,
"reward": 0.64453125,
"reward_std": 0.1528160572052002,
"rewards/accuracy_reward/mean": 0.64453125,
"rewards/accuracy_reward/std": 0.4795927405357361,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1882.0,
"completions/mean_length": 709.24609375,
"completions/mean_terminated_length": 682.5776977539062,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.6999199786609762,
"grad_norm": 0.16783695896488116,
"learning_rate": 2.5650558779781635e-07,
"loss": 0.0212,
"num_tokens": 34586250.0,
"reward": 0.5546875,
"reward_std": 0.23448437452316284,
"rewards/accuracy_reward/mean": 0.5546875,
"rewards/accuracy_reward/std": 0.49797385931015015,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1936.0,
"completions/mean_length": 641.84765625,
"completions/mean_terminated_length": 630.7755737304688,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.7041877834089091,
"grad_norm": 0.13916296254919422,
"learning_rate": 2.500000000000001e-07,
"loss": 0.0377,
"num_tokens": 34785107.0,
"reward": 0.640625,
"reward_std": 0.18477579951286316,
"rewards/accuracy_reward/mean": 0.640625,
"rewards/accuracy_reward/std": 0.4807571768760681,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1818.0,
"completions/mean_length": 698.90625,
"completions/mean_terminated_length": 682.9091186523438,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.7084555881568418,
"grad_norm": 0.16427060939231672,
"learning_rate": 2.4355036129704696e-07,
"loss": 0.022,
"num_tokens": 35000283.0,
"reward": 0.55859375,
"reward_std": 0.26143547892570496,
"rewards/accuracy_reward/mean": 0.55859375,
"rewards/accuracy_reward/std": 0.4975275993347168,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1919.0,
"completions/max_terminated_length": 1919.0,
"completions/mean_length": 690.453125,
"completions/mean_terminated_length": 690.453125,
"completions/min_length": 250.0,
"completions/min_terminated_length": 250.0,
"epoch": 0.7127233929047746,
"grad_norm": 0.16192550263946426,
"learning_rate": 2.371581150947476e-07,
"loss": 0.003,
"num_tokens": 35210223.0,
"reward": 0.5859375,
"reward_std": 0.18675412237644196,
"rewards/accuracy_reward/mean": 0.5859375,
"rewards/accuracy_reward/std": 0.4935242533683777,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 723.69140625,
"completions/mean_terminated_length": 697.310791015625,
"completions/min_length": 175.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.7169911976527074,
"grad_norm": 0.16721621945604426,
"learning_rate": 2.3082469195465893e-07,
"loss": -0.0017,
"num_tokens": 35434304.0,
"reward": 0.60546875,
"reward_std": 0.22396379709243774,
"rewards/accuracy_reward/mean": 0.60546875,
"rewards/accuracy_reward/std": 0.48970720171928406,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1851.0,
"completions/mean_length": 615.58203125,
"completions/mean_terminated_length": 604.3031616210938,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.7212590024006402,
"grad_norm": 0.18161777198423945,
"learning_rate": 2.2455150927394878e-07,
"loss": 0.0457,
"num_tokens": 35625213.0,
"reward": 0.6640625,
"reward_std": 0.23395150899887085,
"rewards/accuracy_reward/mean": 0.6640625,
"rewards/accuracy_reward/std": 0.4732423722743988,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1994.0,
"completions/mean_length": 633.375,
"completions/mean_terminated_length": 622.2362060546875,
"completions/min_length": 140.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.725526807148573,
"grad_norm": 0.15728519931429938,
"learning_rate": 2.1833997096818895e-07,
"loss": 0.0157,
"num_tokens": 35830397.0,
"reward": 0.703125,
"reward_std": 0.21778053045272827,
"rewards/accuracy_reward/mean": 0.703125,
"rewards/accuracy_reward/std": 0.45777595043182373,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1954.0,
"completions/mean_length": 710.61328125,
"completions/mean_terminated_length": 667.4717407226562,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.7297946118965057,
"grad_norm": 0.19673166776660161,
"learning_rate": 2.121914671571633e-07,
"loss": 0.0184,
"num_tokens": 36053362.0,
"reward": 0.5859375,
"reward_std": 0.2480328232049942,
"rewards/accuracy_reward/mean": 0.5859375,
"rewards/accuracy_reward/std": 0.4935242533683777,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1657.0,
"completions/mean_length": 645.01953125,
"completions/mean_terminated_length": 628.3834228515625,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.7340624166444385,
"grad_norm": 0.18662654855006655,
"learning_rate": 2.0610737385376348e-07,
"loss": 0.0149,
"num_tokens": 36257215.0,
"reward": 0.625,
"reward_std": 0.2225247174501419,
"rewards/accuracy_reward/mean": 0.625,
"rewards/accuracy_reward/std": 0.4850712716579437,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1933.0,
"completions/mean_length": 729.95703125,
"completions/mean_terminated_length": 724.7882690429688,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.7383302213923713,
"grad_norm": 0.15914760867061006,
"learning_rate": 2.0008905265604315e-07,
"loss": 0.016,
"num_tokens": 36487676.0,
"reward": 0.61328125,
"reward_std": 0.21397364139556885,
"rewards/accuracy_reward/mean": 0.61328125,
"rewards/accuracy_reward/std": 0.4879522919654846,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1254.0,
"completions/mean_length": 512.953125,
"completions/mean_terminated_length": 506.933349609375,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.7425980261403041,
"grad_norm": 0.21401549472539927,
"learning_rate": 1.9413785044249676e-07,
"loss": 0.0099,
"num_tokens": 36652152.0,
"reward": 0.828125,
"reward_std": 0.22962790727615356,
"rewards/accuracy_reward/mean": 0.828125,
"rewards/accuracy_reward/std": 0.3780108094215393,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1919.0,
"completions/mean_length": 823.72265625,
"completions/mean_terminated_length": 799.3346557617188,
"completions/min_length": 223.0,
"completions/min_terminated_length": 223.0,
"epoch": 0.7468658308882369,
"grad_norm": 0.1935931177350886,
"learning_rate": 1.8825509907063326e-07,
"loss": 0.0157,
"num_tokens": 36909769.0,
"reward": 0.546875,
"reward_std": 0.26394912600517273,
"rewards/accuracy_reward/mean": 0.546875,
"rewards/accuracy_reward/std": 0.4987730085849762,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1846.0,
"completions/max_terminated_length": 1846.0,
"completions/mean_length": 588.75390625,
"completions/mean_terminated_length": 588.75390625,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.7511336356361696,
"grad_norm": 0.18692109373962446,
"learning_rate": 1.824421150789106e-07,
"loss": 0.0363,
"num_tokens": 37094778.0,
"reward": 0.71484375,
"reward_std": 0.22278673946857452,
"rewards/accuracy_reward/mean": 0.71484375,
"rewards/accuracy_reward/std": 0.4523732364177704,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1952.0,
"completions/max_terminated_length": 1952.0,
"completions/mean_length": 611.62109375,
"completions/mean_terminated_length": 611.62109375,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.7554014403841024,
"grad_norm": 0.19122585153150984,
"learning_rate": 1.7670019939210023e-07,
"loss": 0.0113,
"num_tokens": 37296913.0,
"reward": 0.63671875,
"reward_std": 0.1732081174850464,
"rewards/accuracy_reward/mean": 0.63671875,
"rewards/accuracy_reward/std": 0.48188701272010803,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 737.125,
"completions/mean_terminated_length": 726.8031616210938,
"completions/min_length": 237.0,
"completions/min_terminated_length": 237.0,
"epoch": 0.7596692451320353,
"grad_norm": 0.20920060817748004,
"learning_rate": 1.710306370301437e-07,
"loss": 0.0334,
"num_tokens": 37521321.0,
"reward": 0.61328125,
"reward_std": 0.29116860032081604,
"rewards/accuracy_reward/mean": 0.61328125,
"rewards/accuracy_reward/std": 0.4879522919654846,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 826.265625,
"completions/mean_terminated_length": 781.7490234375,
"completions/min_length": 208.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.763937049879968,
"grad_norm": 0.16926216445641215,
"learning_rate": 1.6543469682057104e-07,
"loss": 0.0227,
"num_tokens": 37767125.0,
"reward": 0.47265625,
"reward_std": 0.2974705398082733,
"rewards/accuracy_reward/mean": 0.47265625,
"rewards/accuracy_reward/std": 0.5002297759056091,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1650.0,
"completions/mean_length": 621.671875,
"completions/mean_terminated_length": 610.44091796875,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.7682048546279008,
"grad_norm": 0.25834397337634973,
"learning_rate": 1.599136311145402e-07,
"loss": 0.0052,
"num_tokens": 37958313.0,
"reward": 0.63671875,
"reward_std": 0.2747243642807007,
"rewards/accuracy_reward/mean": 0.63671875,
"rewards/accuracy_reward/std": 0.48188701272010803,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1872.0,
"completions/mean_length": 631.9296875,
"completions/mean_terminated_length": 609.452392578125,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.7724726593758335,
"grad_norm": 0.21432978221288052,
"learning_rate": 1.5446867550656767e-07,
"loss": 0.0292,
"num_tokens": 38156599.0,
"reward": 0.671875,
"reward_std": 0.27328526973724365,
"rewards/accuracy_reward/mean": 0.671875,
"rewards/accuracy_reward/std": 0.47045037150382996,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 752.87890625,
"completions/mean_terminated_length": 716.4698486328125,
"completions/min_length": 210.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.7767404641237663,
"grad_norm": 0.17437061893863778,
"learning_rate": 1.4910104855800426e-07,
"loss": 0.0221,
"num_tokens": 38383904.0,
"reward": 0.6328125,
"reward_std": 0.19780902564525604,
"rewards/accuracy_reward/mean": 0.6328125,
"rewards/accuracy_reward/std": 0.48298248648643494,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1945.0,
"completions/mean_length": 732.80859375,
"completions/mean_terminated_length": 717.2134399414062,
"completions/min_length": 144.0,
"completions/min_terminated_length": 144.0,
"epoch": 0.7810082688716992,
"grad_norm": 0.1598318880580154,
"learning_rate": 1.4381195152432769e-07,
"loss": 0.0221,
"num_tokens": 38611175.0,
"reward": 0.6328125,
"reward_std": 0.2296190708875656,
"rewards/accuracy_reward/mean": 0.6328125,
"rewards/accuracy_reward/std": 0.48298248648643494,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 717.453125,
"completions/mean_terminated_length": 690.9482421875,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.7852760736196319,
"grad_norm": 0.18863795184539878,
"learning_rate": 1.3860256808630427e-07,
"loss": 0.0162,
"num_tokens": 38834187.0,
"reward": 0.5390625,
"reward_std": 0.21158601343631744,
"rewards/accuracy_reward/mean": 0.5390625,
"rewards/accuracy_reward/std": 0.4994482398033142,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1636.0,
"completions/mean_length": 652.3125,
"completions/mean_terminated_length": 635.7628784179688,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.7895438783675647,
"grad_norm": 0.20121677777375388,
"learning_rate": 1.3347406408508694e-07,
"loss": 0.0113,
"num_tokens": 39035419.0,
"reward": 0.55859375,
"reward_std": 0.26382553577423096,
"rewards/accuracy_reward/mean": 0.55859375,
"rewards/accuracy_reward/std": 0.4975275993347168,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1879.0,
"completions/mean_length": 618.82421875,
"completions/mean_terminated_length": 584.5240478515625,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.7938116831154974,
"grad_norm": 0.16491042479146237,
"learning_rate": 1.284275872613028e-07,
"loss": 0.0368,
"num_tokens": 39231782.0,
"reward": 0.6328125,
"reward_std": 0.1459837257862091,
"rewards/accuracy_reward/mean": 0.6328125,
"rewards/accuracy_reward/std": 0.48298248648643494,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1939.0,
"completions/mean_length": 635.5625,
"completions/mean_terminated_length": 607.4263305664062,
"completions/min_length": 119.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.7980794878634303,
"grad_norm": 0.23455442334789772,
"learning_rate": 1.2346426699819456e-07,
"loss": 0.0442,
"num_tokens": 39433758.0,
"reward": 0.68359375,
"reward_std": 0.19989721477031708,
"rewards/accuracy_reward/mean": 0.68359375,
"rewards/accuracy_reward/std": 0.4659844934940338,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1999.0,
"completions/mean_length": 713.671875,
"completions/mean_terminated_length": 692.4921264648438,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"epoch": 0.8023472926113631,
"grad_norm": 0.15756234156819612,
"learning_rate": 1.1858521406886674e-07,
"loss": -0.0005,
"num_tokens": 39666066.0,
"reward": 0.50390625,
"reward_std": 0.1810988485813141,
"rewards/accuracy_reward/mean": 0.50390625,
"rewards/accuracy_reward/std": 0.5009641647338867,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2036.0,
"completions/mean_length": 776.61328125,
"completions/mean_terminated_length": 719.5305786132812,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.8066150973592958,
"grad_norm": 0.18804973381454748,
"learning_rate": 1.1379152038770029e-07,
"loss": 0.0219,
"num_tokens": 39908447.0,
"reward": 0.4765625,
"reward_std": 0.25513601303100586,
"rewards/accuracy_reward/mean": 0.4765625,
"rewards/accuracy_reward/std": 0.5004287362098694,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1898.0,
"completions/mean_length": 791.19921875,
"completions/mean_terminated_length": 761.0360107421875,
"completions/min_length": 255.0,
"completions/min_terminated_length": 255.0,
"epoch": 0.8108829021072286,
"grad_norm": 0.16325751022804733,
"learning_rate": 1.090842587659851e-07,
"loss": 0.0221,
"num_tokens": 40146162.0,
"reward": 0.53125,
"reward_std": 0.24041050672531128,
"rewards/accuracy_reward/mean": 0.53125,
"rewards/accuracy_reward/std": 0.5,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1866.0,
"completions/mean_length": 606.69921875,
"completions/mean_terminated_length": 601.047119140625,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"epoch": 0.8151507068551613,
"grad_norm": 0.1573394710102458,
"learning_rate": 1.044644826718295e-07,
"loss": 0.0311,
"num_tokens": 40331989.0,
"reward": 0.734375,
"reward_std": 0.17347405850887299,
"rewards/accuracy_reward/mean": 0.734375,
"rewards/accuracy_reward/std": 0.4425306022167206,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1443.0,
"completions/mean_length": 645.078125,
"completions/mean_terminated_length": 617.1314697265625,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.8194185116030942,
"grad_norm": 0.4096923043938982,
"learning_rate": 9.99332259943969e-08,
"loss": 0.0204,
"num_tokens": 40531993.0,
"reward": 0.6484375,
"reward_std": 0.25567278265953064,
"rewards/accuracy_reward/mean": 0.6484375,
"rewards/accuracy_reward/std": 0.47839346528053284,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 692.7109375,
"completions/mean_terminated_length": 665.7131958007812,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.823686316351027,
"grad_norm": 0.18040801058119468,
"learning_rate": 9.549150281252632e-08,
"loss": 0.002,
"num_tokens": 40743095.0,
"reward": 0.60546875,
"reward_std": 0.2296215295791626,
"rewards/accuracy_reward/mean": 0.60546875,
"rewards/accuracy_reward/std": 0.48970720171928406,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1979.0,
"completions/mean_length": 665.02734375,
"completions/mean_terminated_length": 643.075439453125,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"epoch": 0.8279541210989597,
"grad_norm": 0.17019939572706502,
"learning_rate": 9.114030716778432e-08,
"loss": 0.0224,
"num_tokens": 40947638.0,
"reward": 0.57421875,
"reward_std": 0.1618887335062027,
"rewards/accuracy_reward/mean": 0.57421875,
"rewards/accuracy_reward/std": 0.49542948603630066,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1628.0,
"completions/max_terminated_length": 1628.0,
"completions/mean_length": 726.453125,
"completions/mean_terminated_length": 726.453125,
"completions/min_length": 219.0,
"completions/min_terminated_length": 219.0,
"epoch": 0.8322219258468925,
"grad_norm": 0.19470897147732638,
"learning_rate": 8.688061284200265e-08,
"loss": 0.0125,
"num_tokens": 41171050.0,
"reward": 0.5703125,
"reward_std": 0.2899891138076782,
"rewards/accuracy_reward/mean": 0.5703125,
"rewards/accuracy_reward/std": 0.4960011839866638,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1920.0,
"completions/mean_length": 715.34375,
"completions/mean_terminated_length": 694.1904907226562,
"completions/min_length": 234.0,
"completions/min_terminated_length": 234.0,
"epoch": 0.8364897305948252,
"grad_norm": 0.16961140906916883,
"learning_rate": 8.271337313934867e-08,
"loss": 0.0199,
"num_tokens": 41394090.0,
"reward": 0.6015625,
"reward_std": 0.21147862076759338,
"rewards/accuracy_reward/mean": 0.6015625,
"rewards/accuracy_reward/std": 0.4905354380607605,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1998.0,
"completions/mean_length": 753.76953125,
"completions/mean_terminated_length": 717.385498046875,
"completions/min_length": 215.0,
"completions/min_terminated_length": 215.0,
"epoch": 0.8407575353427581,
"grad_norm": 0.14267834470354487,
"learning_rate": 7.863952067298041e-08,
"loss": -0.004,
"num_tokens": 41621135.0,
"reward": 0.68359375,
"reward_std": 0.1810988485813141,
"rewards/accuracy_reward/mean": 0.68359375,
"rewards/accuracy_reward/std": 0.4659844934940338,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1730.0,
"completions/mean_length": 717.44921875,
"completions/mean_terminated_length": 696.3294067382812,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"epoch": 0.8450253400906909,
"grad_norm": 0.18522820075393684,
"learning_rate": 7.465996715633027e-08,
"loss": 0.0201,
"num_tokens": 41845914.0,
"reward": 0.59375,
"reward_std": 0.3056321144104004,
"rewards/accuracy_reward/mean": 0.59375,
"rewards/accuracy_reward/std": 0.49209436774253845,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1848.0,
"completions/mean_length": 703.875,
"completions/mean_terminated_length": 698.6039428710938,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"epoch": 0.8492931448386236,
"grad_norm": 0.17512294585566626,
"learning_rate": 7.077560319906694e-08,
"loss": 0.0129,
"num_tokens": 42060298.0,
"reward": 0.60546875,
"reward_std": 0.1737360954284668,
"rewards/accuracy_reward/mean": 0.60546875,
"rewards/accuracy_reward/std": 0.48970720171928406,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1806.0,
"completions/mean_length": 686.66015625,
"completions/mean_terminated_length": 648.3895263671875,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"epoch": 0.8535609495865564,
"grad_norm": 0.12460188705163991,
"learning_rate": 6.698729810778064e-08,
"loss": 0.0108,
"num_tokens": 42271123.0,
"reward": 0.68359375,
"reward_std": 0.135463148355484,
"rewards/accuracy_reward/mean": 0.68359375,
"rewards/accuracy_reward/std": 0.4659844934940338,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1881.0,
"completions/mean_length": 628.1796875,
"completions/mean_terminated_length": 582.3790283203125,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.8578287543344892,
"grad_norm": 1.1717296106182498,
"learning_rate": 6.329589969143517e-08,
"loss": 0.0238,
"num_tokens": 42468385.0,
"reward": 0.640625,
"reward_std": 0.2137165069580078,
"rewards/accuracy_reward/mean": 0.640625,
"rewards/accuracy_reward/std": 0.4807571768760681,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2036.0,
"completions/mean_length": 730.0546875,
"completions/mean_terminated_length": 698.4240112304688,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"epoch": 0.862096559082422,
"grad_norm": 0.15656348367686632,
"learning_rate": 5.9702234071631e-08,
"loss": 0.0016,
"num_tokens": 42693055.0,
"reward": 0.56640625,
"reward_std": 0.17635467648506165,
"rewards/accuracy_reward/mean": 0.56640625,
"rewards/accuracy_reward/std": 0.4965413510799408,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1857.0,
"completions/mean_length": 724.73046875,
"completions/mean_terminated_length": 692.9720458984375,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.8663643638303548,
"grad_norm": 0.1459269992065347,
"learning_rate": 5.620710549772295e-08,
"loss": 0.0421,
"num_tokens": 42912986.0,
"reward": 0.59375,
"reward_std": 0.16860876977443695,
"rewards/accuracy_reward/mean": 0.59375,
"rewards/accuracy_reward/std": 0.49209436774253845,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1984.0,
"completions/mean_length": 693.640625,
"completions/mean_terminated_length": 672.1428833007812,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"epoch": 0.8706321685782875,
"grad_norm": 0.1809249668666066,
"learning_rate": 5.2811296166831666e-08,
"loss": 0.0218,
"num_tokens": 43125238.0,
"reward": 0.61328125,
"reward_std": 0.24606087803840637,
"rewards/accuracy_reward/mean": 0.61328125,
"rewards/accuracy_reward/std": 0.4879522919654846,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1649.0,
"completions/mean_length": 711.7578125,
"completions/mean_terminated_length": 695.9130859375,
"completions/min_length": 212.0,
"completions/min_terminated_length": 212.0,
"epoch": 0.8748999733262203,
"grad_norm": 0.20158349817144566,
"learning_rate": 4.951556604879048e-08,
"loss": 0.0166,
"num_tokens": 43348424.0,
"reward": 0.58984375,
"reward_std": 0.27434366941452026,
"rewards/accuracy_reward/mean": 0.58984375,
"rewards/accuracy_reward/std": 0.49282538890838623,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 723.2421875,
"completions/mean_terminated_length": 686.0,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.8791677780741531,
"grad_norm": 0.16555289267225737,
"learning_rate": 4.6320652716067555e-08,
"loss": 0.027,
"num_tokens": 43566182.0,
"reward": 0.58203125,
"reward_std": 0.25540196895599365,
"rewards/accuracy_reward/mean": 0.58203125,
"rewards/accuracy_reward/std": 0.49419113993644714,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2003.0,
"completions/mean_length": 716.3203125,
"completions/mean_terminated_length": 689.7928466796875,
"completions/min_length": 247.0,
"completions/min_terminated_length": 247.0,
"epoch": 0.8834355828220859,
"grad_norm": 0.15251148614367968,
"learning_rate": 4.322727117869951e-08,
"loss": 0.0125,
"num_tokens": 43784416.0,
"reward": 0.66796875,
"reward_std": 0.21633265912532806,
"rewards/accuracy_reward/mean": 0.66796875,
"rewards/accuracy_reward/std": 0.4718646705150604,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1929.0,
"completions/mean_length": 688.9609375,
"completions/mean_terminated_length": 667.388916015625,
"completions/min_length": 183.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.8877033875700187,
"grad_norm": 0.17443255375688374,
"learning_rate": 4.023611372427471e-08,
"loss": 0.0037,
"num_tokens": 44001166.0,
"reward": 0.625,
"reward_std": 0.2396092414855957,
"rewards/accuracy_reward/mean": 0.625,
"rewards/accuracy_reward/std": 0.4850712716579437,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1807.0,
"completions/mean_length": 704.53515625,
"completions/mean_terminated_length": 688.6047973632812,
"completions/min_length": 166.0,
"completions/min_terminated_length": 166.0,
"epoch": 0.8919711923179514,
"grad_norm": 0.16853797409498864,
"learning_rate": 3.734784976300165e-08,
"loss": 0.0163,
"num_tokens": 44226015.0,
"reward": 0.5390625,
"reward_std": 0.25091981887817383,
"rewards/accuracy_reward/mean": 0.5390625,
"rewards/accuracy_reward/std": 0.4994482398033142,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 209
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 738.59375,
"completions/mean_terminated_length": 717.8095703125,
"completions/min_length": 224.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.8962389970658843,
"grad_norm": 0.14429226057527744,
"learning_rate": 3.456312567789793e-08,
"loss": 0.0026,
"num_tokens": 44447887.0,
"reward": 0.671875,
"reward_std": 0.24396878480911255,
"rewards/accuracy_reward/mean": 0.671875,
"rewards/accuracy_reward/std": 0.47045037150382996,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 647.8515625,
"completions/mean_terminated_length": 614.248046875,
"completions/min_length": 190.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.900506801813817,
"grad_norm": 0.21336156797511663,
"learning_rate": 3.188256468013139e-08,
"loss": 0.0319,
"num_tokens": 44648865.0,
"reward": 0.6640625,
"reward_std": 0.23357079923152924,
"rewards/accuracy_reward/mean": 0.6640625,
"rewards/accuracy_reward/std": 0.4732423722743988,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2046.0,
"completions/mean_length": 732.5859375,
"completions/mean_terminated_length": 690.1531982421875,
"completions/min_length": 173.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.9047746065617498,
"grad_norm": 0.1594541338224898,
"learning_rate": 2.9306766669548457e-08,
"loss": 0.0178,
"num_tokens": 44879447.0,
"reward": 0.51953125,
"reward_std": 0.21660104393959045,
"rewards/accuracy_reward/mean": 0.51953125,
"rewards/accuracy_reward/std": 0.5005971193313599,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1799.0,
"completions/mean_length": 666.71484375,
"completions/mean_terminated_length": 644.7897338867188,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"epoch": 0.9090424113096826,
"grad_norm": 0.13402021740421238,
"learning_rate": 2.6836308100417872e-08,
"loss": 0.0209,
"num_tokens": 45084174.0,
"reward": 0.60546875,
"reward_std": 0.1621571183204651,
"rewards/accuracy_reward/mean": 0.60546875,
"rewards/accuracy_reward/std": 0.48970720171928406,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2036.0,
"completions/mean_length": 640.21484375,
"completions/mean_terminated_length": 600.6385498046875,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"epoch": 0.9133102160576153,
"grad_norm": 0.16506792868914458,
"learning_rate": 2.4471741852423233e-08,
"loss": 0.0484,
"num_tokens": 45280933.0,
"reward": 0.76953125,
"reward_std": 0.16439500451087952,
"rewards/accuracy_reward/mean": 0.76953125,
"rewards/accuracy_reward/std": 0.4219578504562378,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 214
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 743.671875,
"completions/mean_terminated_length": 717.6892700195312,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"epoch": 0.9175780208055482,
"grad_norm": 0.152763201379616,
"learning_rate": 2.2213597106929605e-08,
"loss": 0.0177,
"num_tokens": 45510841.0,
"reward": 0.6015625,
"reward_std": 0.19503435492515564,
"rewards/accuracy_reward/mean": 0.6015625,
"rewards/accuracy_reward/std": 0.4905354380607605,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 733.44140625,
"completions/mean_terminated_length": 707.2550048828125,
"completions/min_length": 217.0,
"completions/min_terminated_length": 217.0,
"epoch": 0.9218458255534809,
"grad_norm": 0.1645677770956383,
"learning_rate": 2.0062379228555525e-08,
"loss": -0.002,
"num_tokens": 45732058.0,
"reward": 0.67578125,
"reward_std": 0.22973774373531342,
"rewards/accuracy_reward/mean": 0.67578125,
"rewards/accuracy_reward/std": 0.46899911761283875,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1940.0,
"completions/mean_length": 687.875,
"completions/mean_terminated_length": 677.1653442382812,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.9261136303014137,
"grad_norm": 0.24378981471372618,
"learning_rate": 1.8018569652073378e-08,
"loss": 0.0226,
"num_tokens": 45946338.0,
"reward": 0.65234375,
"reward_std": 0.24302664399147034,
"rewards/accuracy_reward/mean": 0.65234375,
"rewards/accuracy_reward/std": 0.4771590530872345,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 217
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1885.0,
"completions/mean_length": 725.7890625,
"completions/mean_terminated_length": 683.1370849609375,
"completions/min_length": 251.0,
"completions/min_terminated_length": 251.0,
"epoch": 0.9303814350493465,
"grad_norm": 0.16104351367263942,
"learning_rate": 1.6082625774666792e-08,
"loss": 0.0192,
"num_tokens": 46169276.0,
"reward": 0.58203125,
"reward_std": 0.17662307620048523,
"rewards/accuracy_reward/mean": 0.58203125,
"rewards/accuracy_reward/std": 0.49419113993644714,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 218
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1949.0,
"completions/mean_length": 750.88671875,
"completions/mean_terminated_length": 681.4938354492188,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.9346492397972793,
"grad_norm": 0.15362390550001964,
"learning_rate": 1.4254980853566246e-08,
"loss": 0.0354,
"num_tokens": 46399943.0,
"reward": 0.58984375,
"reward_std": 0.19279402494430542,
"rewards/accuracy_reward/mean": 0.58984375,
"rewards/accuracy_reward/std": 0.49282538890838623,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 219
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1451.0,
"completions/mean_length": 669.7421875,
"completions/mean_terminated_length": 636.6640014648438,
"completions/min_length": 180.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.9389170445452121,
"grad_norm": 0.13543826848213528,
"learning_rate": 1.253604390908819e-08,
"loss": 0.0214,
"num_tokens": 46605885.0,
"reward": 0.58984375,
"reward_std": 0.13546313345432281,
"rewards/accuracy_reward/mean": 0.58984375,
"rewards/accuracy_reward/std": 0.49282538890838623,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1919.0,
"completions/mean_length": 783.8828125,
"completions/mean_terminated_length": 778.925537109375,
"completions/min_length": 152.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.9431848492931448,
"grad_norm": 0.17759133312220565,
"learning_rate": 1.0926199633097154e-08,
"loss": 0.0247,
"num_tokens": 46854583.0,
"reward": 0.51953125,
"reward_std": 0.1726752519607544,
"rewards/accuracy_reward/mean": 0.51953125,
"rewards/accuracy_reward/std": 0.5005971193313599,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 221
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1763.0,
"completions/mean_length": 649.19140625,
"completions/mean_terminated_length": 632.6047973632812,
"completions/min_length": 239.0,
"completions/min_terminated_length": 239.0,
"epoch": 0.9474526540410776,
"grad_norm": 0.1586721854654231,
"learning_rate": 9.425808302913728e-09,
"loss": 0.0301,
"num_tokens": 47057680.0,
"reward": 0.69921875,
"reward_std": 0.21371403336524963,
"rewards/accuracy_reward/mean": 0.69921875,
"rewards/accuracy_reward/std": 0.45949608087539673,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 222
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 626.34765625,
"completions/mean_terminated_length": 603.7817993164062,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.9517204587890103,
"grad_norm": 0.17483755006208948,
"learning_rate": 8.035205700685165e-09,
"loss": 0.0203,
"num_tokens": 47261201.0,
"reward": 0.69140625,
"reward_std": 0.19556477665901184,
"rewards/accuracy_reward/mean": 0.69140625,
"rewards/accuracy_reward/std": 0.46281787753105164,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 223
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1913.0,
"completions/mean_length": 588.92578125,
"completions/mean_terminated_length": 565.7659301757812,
"completions/min_length": 210.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.9559882635369432,
"grad_norm": 0.16063372689194239,
"learning_rate": 6.754703038239329e-09,
"loss": 0.0156,
"num_tokens": 47445534.0,
"reward": 0.79296875,
"reward_std": 0.17715102434158325,
"rewards/accuracy_reward/mean": 0.79296875,
"rewards/accuracy_reward/std": 0.40597182512283325,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1801.0,
"completions/mean_length": 691.62109375,
"completions/mean_terminated_length": 675.53759765625,
"completions/min_length": 208.0,
"completions/min_terminated_length": 208.0,
"epoch": 0.960256068284876,
"grad_norm": 0.22328275656543126,
"learning_rate": 5.5845868874357385e-09,
"loss": 0.0216,
"num_tokens": 47660813.0,
"reward": 0.63671875,
"reward_std": 0.2789405584335327,
"rewards/accuracy_reward/mean": 0.63671875,
"rewards/accuracy_reward/std": 0.48188701272010803,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1832.0,
"completions/mean_length": 687.15234375,
"completions/mean_terminated_length": 648.8955688476562,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.9645238730328087,
"grad_norm": 0.1940699434225098,
"learning_rate": 4.5251191160326495e-09,
"loss": 0.0162,
"num_tokens": 47875420.0,
"reward": 0.6328125,
"reward_std": 0.229887455701828,
"rewards/accuracy_reward/mean": 0.6328125,
"rewards/accuracy_reward/std": 0.48298248648643494,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 226
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 632.83203125,
"completions/mean_terminated_length": 604.6414794921875,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.9687916777807415,
"grad_norm": 0.14202828475012133,
"learning_rate": 3.5765368290813223e-09,
"loss": 0.0051,
"num_tokens": 48070545.0,
"reward": 0.73046875,
"reward_std": 0.15952971577644348,
"rewards/accuracy_reward/mean": 0.73046875,
"rewards/accuracy_reward/std": 0.44458550214767456,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 227
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1917.0,
"completions/mean_length": 783.6953125,
"completions/mean_terminated_length": 742.9112548828125,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.9730594825286744,
"grad_norm": 1.825730745371174,
"learning_rate": 2.739052315863355e-09,
"loss": 0.0126,
"num_tokens": 48308835.0,
"reward": 0.53515625,
"reward_std": 0.2778797447681427,
"rewards/accuracy_reward/mean": 0.53515625,
"rewards/accuracy_reward/std": 0.49973952770233154,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1901.0,
"completions/mean_length": 572.68359375,
"completions/mean_terminated_length": 561.0669555664062,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.9773272872766071,
"grad_norm": 0.17942667026211423,
"learning_rate": 2.0128530023804656e-09,
"loss": 0.0248,
"num_tokens": 48489730.0,
"reward": 0.73828125,
"reward_std": 0.18911069631576538,
"rewards/accuracy_reward/mean": 0.73828125,
"rewards/accuracy_reward/std": 0.4404313564300537,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 229
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1728.0,
"completions/mean_length": 736.47265625,
"completions/mean_terminated_length": 688.6842041015625,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.9815950920245399,
"grad_norm": 0.11589896012509639,
"learning_rate": 1.3981014094099353e-09,
"loss": 0.004,
"num_tokens": 48727051.0,
"reward": 0.55859375,
"reward_std": 0.14256632328033447,
"rewards/accuracy_reward/mean": 0.55859375,
"rewards/accuracy_reward/std": 0.4975275993347168,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 731.9765625,
"completions/mean_terminated_length": 672.8897705078125,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.9858628967724726,
"grad_norm": 0.17379082291728887,
"learning_rate": 8.949351161324225e-10,
"loss": 0.0287,
"num_tokens": 48951677.0,
"reward": 0.6328125,
"reward_std": 0.21686306595802307,
"rewards/accuracy_reward/mean": 0.6328125,
"rewards/accuracy_reward/std": 0.48298248648643494,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 231
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1865.0,
"completions/mean_length": 613.2109375,
"completions/mean_terminated_length": 590.4365234375,
"completions/min_length": 145.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.9901307015204054,
"grad_norm": 0.14136371185185287,
"learning_rate": 5.034667293427053e-10,
"loss": 0.0157,
"num_tokens": 49140963.0,
"reward": 0.62109375,
"reward_std": 0.15057817101478577,
"rewards/accuracy_reward/mean": 0.62109375,
"rewards/accuracy_reward/std": 0.4860650300979614,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2008.0,
"completions/mean_length": 718.78515625,
"completions/mean_terminated_length": 681.4176635742188,
"completions/min_length": 168.0,
"completions/min_terminated_length": 168.0,
"epoch": 0.9943985062683383,
"grad_norm": 0.15367312232291866,
"learning_rate": 2.2378385824833866e-10,
"loss": 0.0459,
"num_tokens": 49355428.0,
"reward": 0.671875,
"reward_std": 0.15558436512947083,
"rewards/accuracy_reward/mean": 0.671875,
"rewards/accuracy_reward/std": 0.47045037150382996,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 233
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1743.0,
"completions/max_terminated_length": 1743.0,
"completions/mean_length": 515.4459838867188,
"completions/mean_terminated_length": 515.4459838867188,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"epoch": 0.998666311016271,
"grad_norm": 0.16966303678970307,
"learning_rate": 5.594909486328348e-11,
"loss": 0.0063,
"num_tokens": 49525500.0,
"reward": 0.7890625,
"reward_std": 0.2095002979040146,
"rewards/accuracy_reward/mean": 0.7890625,
"rewards/accuracy_reward/std": 0.4087733030319214,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 234
},
{
"epoch": 0.998666311016271,
"step": 234,
"total_flos": 0.0,
"train_loss": 0.015861935917542785,
"train_runtime": 47487.0739,
"train_samples_per_second": 0.158,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 234,
"num_input_tokens_seen": 49525500,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}