Files
OpenRS-GRPO-S/trainer_state.json
ModelHub XC d2853dd1bb 初始化项目,由ModelHub XC社区提供模型
Model: mimoidochi/OpenRS-GRPO-S
Source: Original Platform
2026-04-30 05:08:48 +08:00

4943 lines
152 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 500,
"global_step": 350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 3134.95849609375,
"epoch": 0.0005714285714285715,
"grad_norm": 0.6937011480331421,
"kl": 0.0,
"learning_rate": 2e-08,
"loss": -0.7208,
"reward": 0.27500003203749657,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.2916666679084301,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2868.9583740234375,
"epoch": 0.001142857142857143,
"grad_norm": 1.1340324878692627,
"kl": 0.0,
"learning_rate": 4e-08,
"loss": -0.7386,
"reward": 0.27500003576278687,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.4583333432674408,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 3083.70849609375,
"epoch": 0.0017142857142857142,
"grad_norm": 1.927505373954773,
"kl": 1.7076730728149414e-05,
"learning_rate": 6e-08,
"loss": -2.292,
"reward": 0.3500000163912773,
"reward_std": 0.40609321743249893,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.3333333358168602,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 2651.791748046875,
"epoch": 0.002285714285714286,
"grad_norm": 2.0249767303466797,
"kl": 3.701448440551758e-05,
"learning_rate": 8e-08,
"loss": -2.4107,
"reward": 0.40000002086162567,
"reward_std": 0.27739381790161133,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.5833333432674408,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2761.916748046875,
"epoch": 0.002857142857142857,
"grad_norm": 2.0101120471954346,
"kl": 3.8623809814453125e-05,
"learning_rate": 1e-07,
"loss": -0.725,
"reward": 0.17500000819563866,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.2916666679084301,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 3023.8333740234375,
"epoch": 0.0034285714285714284,
"grad_norm": 0.3757542073726654,
"kl": 3.993511199951172e-05,
"learning_rate": 1.2e-07,
"loss": 0.0,
"reward": 0.15000000596046448,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 3147.75,
"epoch": 0.004,
"grad_norm": 1.2157344818115234,
"kl": 2.2083520889282227e-05,
"learning_rate": 1.4e-07,
"loss": -1.6438,
"reward": 0.15000000596046448,
"reward_std": 0.19993415474891663,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.2083333432674408,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 3028.1251220703125,
"epoch": 0.004571428571428572,
"grad_norm": 0.7169070243835449,
"kl": 3.1948089599609375e-05,
"learning_rate": 1.6e-07,
"loss": -0.9882,
"reward": 0.2750000059604645,
"reward_std": 0.14747881889343262,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.375,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2304.666748046875,
"epoch": 0.005142857142857143,
"grad_norm": 0.383722186088562,
"kl": 3.325939178466797e-05,
"learning_rate": 1.8e-07,
"loss": 0.0,
"reward": 0.4500000327825546,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.5,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 3193.625,
"epoch": 0.005714285714285714,
"grad_norm": 1.468209147453308,
"kl": 3.314018249511719e-05,
"learning_rate": 2e-07,
"loss": -1.6361,
"reward": 0.22500000894069672,
"reward_std": 0.22493848204612732,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.25,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2689.916748046875,
"epoch": 0.006285714285714286,
"grad_norm": 1.3963018655776978,
"kl": 5.7578086853027344e-05,
"learning_rate": 2.1999999999999998e-07,
"loss": -0.8156,
"reward": 0.2500000223517418,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3333333358168602,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2627.541748046875,
"epoch": 0.006857142857142857,
"grad_norm": 1.4502061605453491,
"kl": 3.600120544433594e-05,
"learning_rate": 2.4e-07,
"loss": -2.2199,
"reward": 0.5500000715255737,
"reward_std": 0.3433297872543335,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 0.6250000298023224,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 2780.58349609375,
"epoch": 0.0074285714285714285,
"grad_norm": 0.4417027235031128,
"kl": 1.5676021575927734e-05,
"learning_rate": 2.6e-07,
"loss": 0.0,
"reward": 0.45000001788139343,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 2180.0,
"epoch": 0.008,
"grad_norm": 1.337963342666626,
"kl": 4.553794860839844e-05,
"learning_rate": 2.8e-07,
"loss": -1.6775,
"reward": 0.7000000476837158,
"reward_std": 0.19993416219949722,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.7083333730697632,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2745.791748046875,
"epoch": 0.008571428571428572,
"grad_norm": 2.005941390991211,
"kl": 3.337860107421875e-05,
"learning_rate": 3e-07,
"loss": -1.7196,
"reward": 0.2500000149011612,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.4166666716337204,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 2953.541748046875,
"epoch": 0.009142857142857144,
"grad_norm": 1.1998287439346313,
"kl": 4.00543212890625e-05,
"learning_rate": 3.2e-07,
"loss": -1.4557,
"reward": 0.42500001192092896,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.5000000298023224,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 3183.916748046875,
"epoch": 0.009714285714285713,
"grad_norm": 3.6435375213623047,
"kl": 5.614757537841797e-05,
"learning_rate": 3.4000000000000003e-07,
"loss": -2.4555,
"reward": 0.17500000447034836,
"reward_std": 0.28209254145622253,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.2500000111758709,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 3134.75,
"epoch": 0.010285714285714285,
"grad_norm": 0.2575957179069519,
"kl": 5.364418029785156e-05,
"learning_rate": 3.6e-07,
"loss": 0.0,
"reward": 0.15000000596046448,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 2953.791748046875,
"epoch": 0.010857142857142857,
"grad_norm": 1.4193103313446045,
"kl": 5.936622619628906e-05,
"learning_rate": 3.7999999999999996e-07,
"loss": -0.7348,
"reward": 0.30000003799796104,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.2916666679084301,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1856.7083740234375,
"epoch": 0.011428571428571429,
"grad_norm": 0.3330537974834442,
"kl": 2.3543834686279297e-05,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 0.5500000417232513,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.75,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1932.2501220703125,
"epoch": 0.012,
"grad_norm": 2.0459184646606445,
"kl": 3.30805778503418e-05,
"learning_rate": 4.1999999999999995e-07,
"loss": -1.7473,
"reward": 0.5250000357627869,
"reward_std": 0.24647516012191772,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.75,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 3040.416748046875,
"epoch": 0.012571428571428572,
"grad_norm": 2.663806676864624,
"kl": 4.1604042053222656e-05,
"learning_rate": 4.3999999999999997e-07,
"loss": -3.5218,
"reward": 0.6000000238418579,
"reward_std": 0.5641850829124451,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.5416666865348816,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 2432.791748046875,
"epoch": 0.013142857142857144,
"grad_norm": 1.2767139673233032,
"kl": 4.3511390686035156e-05,
"learning_rate": 4.6e-07,
"loss": -0.9799,
"reward": 0.4500000327825546,
"reward_std": 0.20871607959270477,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.625,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 2564.2083740234375,
"epoch": 0.013714285714285714,
"grad_norm": 0.6577972173690796,
"kl": 4.863739013671875e-05,
"learning_rate": 4.8e-07,
"loss": -0.7097,
"reward": 0.5250000357627869,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/format_reward": 0.5416666865348816,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1678.4167175292969,
"epoch": 0.014285714285714285,
"grad_norm": 0.5228314995765686,
"kl": 4.172325134277344e-05,
"learning_rate": 5e-07,
"loss": -0.4805,
"reward": 0.6250000298023224,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.7916666865348816,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 2865.6251220703125,
"epoch": 0.014857142857142857,
"grad_norm": 1.137710690498352,
"kl": 5.5909156799316406e-05,
"learning_rate": 5.2e-07,
"loss": -0.9844,
"reward": 0.3750000149011612,
"reward_std": 0.24647516012191772,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.375,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 3516.375,
"epoch": 0.015428571428571429,
"grad_norm": 2.815162420272827,
"kl": 7.021427154541016e-05,
"learning_rate": 5.4e-07,
"loss": -2.2754,
"reward": 0.1250000111758709,
"reward_std": 0.2479735016822815,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.1666666679084301,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 2613.416748046875,
"epoch": 0.016,
"grad_norm": 1.428934931755066,
"kl": 7.724761962890625e-05,
"learning_rate": 5.6e-07,
"loss": -1.9051,
"reward": 0.42500002682209015,
"reward_std": 0.323934830725193,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/format_reward": 0.5,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 2780.9583740234375,
"epoch": 0.01657142857142857,
"grad_norm": 1.547488808631897,
"kl": 7.271766662597656e-05,
"learning_rate": 5.8e-07,
"loss": -1.3492,
"reward": 0.30000003427267075,
"reward_std": 0.32240864634513855,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.3333333544433117,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 3566.375,
"epoch": 0.017142857142857144,
"grad_norm": 1.6621514558792114,
"kl": 0.000118255615234375,
"learning_rate": 6e-07,
"loss": -1.2378,
"reward": 0.15000000596046448,
"reward_std": 0.27739381790161133,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.125,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2331.5001220703125,
"epoch": 0.017714285714285714,
"grad_norm": 1.5594860315322876,
"kl": 0.00010943412780761719,
"learning_rate": 6.2e-07,
"loss": -1.4703,
"reward": 0.40000002086162567,
"reward_std": 0.2773938253521919,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.458333358168602,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2168.8751220703125,
"epoch": 0.018285714285714287,
"grad_norm": 0.7033916711807251,
"kl": 0.00013780593872070312,
"learning_rate": 6.4e-07,
"loss": -0.678,
"reward": 0.6250000298023224,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5416666865348816,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.018857142857142857,
"grad_norm": 0.45104363560676575,
"kl": 0.0001506805419921875,
"learning_rate": 6.6e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 2776.4583740234375,
"epoch": 0.019428571428571427,
"grad_norm": 0.6990403532981873,
"kl": 0.00020122528076171875,
"learning_rate": 6.800000000000001e-07,
"loss": -0.9026,
"reward": 0.5500000044703484,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.5833333358168602,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 3161.8333740234375,
"epoch": 0.02,
"grad_norm": 2.173389434814453,
"kl": 0.0002646446228027344,
"learning_rate": 7e-07,
"loss": -1.6205,
"reward": 0.17500000447034836,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.291666679084301,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2875.33349609375,
"epoch": 0.02057142857142857,
"grad_norm": 2.574173927307129,
"kl": 0.0003509521484375,
"learning_rate": 7.2e-07,
"loss": -2.5022,
"reward": 0.30000001192092896,
"reward_std": 0.36425092816352844,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.3333333432674408,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 3408.8333740234375,
"epoch": 0.021142857142857144,
"grad_norm": 1.749144434928894,
"kl": 0.000339508056640625,
"learning_rate": 7.4e-07,
"loss": -2.3526,
"reward": 0.2500000149011612,
"reward_std": 0.40926575660705566,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/format_reward": 0.2083333358168602,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 2987.5001220703125,
"epoch": 0.021714285714285714,
"grad_norm": 1.8244866132736206,
"kl": 0.000728607177734375,
"learning_rate": 7.599999999999999e-07,
"loss": -1.8345,
"reward": 0.30000001192092896,
"reward_std": 0.2683281749486923,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.3333333432674408,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1947.7083740234375,
"epoch": 0.022285714285714287,
"grad_norm": 1.6700297594070435,
"kl": 0.000614166259765625,
"learning_rate": 7.799999999999999e-07,
"loss": -2.2225,
"reward": 0.8000000715255737,
"reward_std": 0.24494898319244385,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.8750000298023224,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 3358.75,
"epoch": 0.022857142857142857,
"grad_norm": 0.8734477758407593,
"kl": 0.000514984130859375,
"learning_rate": 8e-07,
"loss": -1.4618,
"reward": 0.10000000894069672,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.1666666716337204,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 2962.7083740234375,
"epoch": 0.023428571428571427,
"grad_norm": 1.2087616920471191,
"kl": 0.00079345703125,
"learning_rate": 8.199999999999999e-07,
"loss": -0.7151,
"reward": 0.32500001788139343,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.2916666865348816,
"rewards/format_reward": 0.25,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2246.0834350585938,
"epoch": 0.024,
"grad_norm": 0.39726927876472473,
"kl": 0.0006685256958007812,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0001,
"reward": 0.3500000238418579,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.5,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 3212.7501220703125,
"epoch": 0.02457142857142857,
"grad_norm": 1.559288740158081,
"kl": 0.000598907470703125,
"learning_rate": 8.599999999999999e-07,
"loss": -1.7896,
"reward": 0.20000001788139343,
"reward_std": 0.2323790118098259,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.2500000074505806,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 3197.5001220703125,
"epoch": 0.025142857142857144,
"grad_norm": 1.7089260816574097,
"kl": 0.0013065338134765625,
"learning_rate": 8.799999999999999e-07,
"loss": -2.2349,
"reward": 0.40000003576278687,
"reward_std": 0.36088940501213074,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/format_reward": 0.3750000149011612,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 3498.95849609375,
"epoch": 0.025714285714285714,
"grad_norm": 1.1292223930358887,
"kl": 0.0009365081787109375,
"learning_rate": 9e-07,
"loss": -1.8457,
"reward": 0.1250000074505806,
"reward_std": 0.1596180573105812,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.2083333358168602,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 2446.4583740234375,
"epoch": 0.026285714285714287,
"grad_norm": 0.8362674713134766,
"kl": 0.002166748046875,
"learning_rate": 9.2e-07,
"loss": -0.9237,
"reward": 0.5500000417232513,
"reward_std": 0.21162375062704086,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.6666666865348816,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2658.0,
"epoch": 0.026857142857142857,
"grad_norm": 1.1083022356033325,
"kl": 0.001728057861328125,
"learning_rate": 9.399999999999999e-07,
"loss": -1.7675,
"reward": 0.5250000059604645,
"reward_std": 0.3030136823654175,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.5416666865348816,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1924.25,
"epoch": 0.027428571428571427,
"grad_norm": 1.7161272764205933,
"kl": 0.01171112060546875,
"learning_rate": 9.6e-07,
"loss": 0.0019,
"reward": 0.6000000089406967,
"reward_std": 0.22085529565811157,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.75,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2205.7500610351562,
"epoch": 0.028,
"grad_norm": 1.093712329864502,
"kl": 0.003936767578125,
"learning_rate": 9.8e-07,
"loss": -0.6857,
"reward": 0.4000000022351742,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.5416666679084301,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 3085.291748046875,
"epoch": 0.02857142857142857,
"grad_norm": 1.334791660308838,
"kl": 0.00530242919921875,
"learning_rate": 1e-06,
"loss": -2.762,
"reward": 0.2750000134110451,
"reward_std": 0.39242780208587646,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.291666679084301,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2834.2083740234375,
"epoch": 0.029142857142857144,
"grad_norm": 0.6573655605316162,
"kl": 0.0026569366455078125,
"learning_rate": 9.999890338174275e-07,
"loss": 0.0004,
"reward": 0.27500003576278687,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.25,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 3296.791748046875,
"epoch": 0.029714285714285714,
"grad_norm": 1.4219310283660889,
"kl": 0.002716064453125,
"learning_rate": 9.999561358041868e-07,
"loss": -2.8224,
"reward": 0.40000003576278687,
"reward_std": 0.38455653190612793,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.4166666716337204,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 2921.291748046875,
"epoch": 0.030285714285714287,
"grad_norm": 1.2512001991271973,
"kl": 0.0057525634765625,
"learning_rate": 9.999013075636804e-07,
"loss": 0.0009,
"reward": 0.25,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.25,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 2937.875,
"epoch": 0.030857142857142857,
"grad_norm": 0.24358271062374115,
"kl": 0.00394439697265625,
"learning_rate": 9.998245517681593e-07,
"loss": 0.0006,
"reward": 0.15000000596046448,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 3225.041748046875,
"epoch": 0.03142857142857143,
"grad_norm": 0.5058190226554871,
"kl": 0.0050811767578125,
"learning_rate": 9.997258721585931e-07,
"loss": -0.4553,
"reward": 0.30000001192092896,
"reward_std": 0.19993415474891663,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.2916666865348816,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 2791.33349609375,
"epoch": 0.032,
"grad_norm": 1.2000298500061035,
"kl": 0.0060272216796875,
"learning_rate": 9.996052735444862e-07,
"loss": -1.9483,
"reward": 0.30000003799796104,
"reward_std": 0.2173428237438202,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.4583333544433117,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 2855.5,
"epoch": 0.03257142857142857,
"grad_norm": 2.375877618789673,
"kl": 0.007659912109375,
"learning_rate": 9.994627618036452e-07,
"loss": -3.1453,
"reward": 0.40000002086162567,
"reward_std": 0.4469800293445587,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.5000000149011612,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 3482.75,
"epoch": 0.03314285714285714,
"grad_norm": 1.0685774087905884,
"kl": 0.005950927734375,
"learning_rate": 9.992983438818915e-07,
"loss": -1.8528,
"reward": 0.1250000111758709,
"reward_std": 0.2479735016822815,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.1666666679084301,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 2283.0834350585938,
"epoch": 0.03371428571428572,
"grad_norm": 1.0396032333374023,
"kl": 0.008758544921875,
"learning_rate": 9.991120277927223e-07,
"loss": -1.6818,
"reward": 0.42500001192092896,
"reward_std": 0.21615658700466156,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.6250000298023224,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 3297.45849609375,
"epoch": 0.03428571428571429,
"grad_norm": 1.2367874383926392,
"kl": 0.006866455078125,
"learning_rate": 9.989038226169207e-07,
"loss": -3.072,
"reward": 0.27500002086162567,
"reward_std": 0.3798578232526779,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3750000149011612,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 2158.541748046875,
"epoch": 0.03485714285714286,
"grad_norm": 0.9891504049301147,
"kl": 0.0059051513671875,
"learning_rate": 9.98673738502114e-07,
"loss": -1.8628,
"reward": 0.7249999940395355,
"reward_std": 0.4160907417535782,
"rewards/accuracy_reward": 0.5000000298023224,
"rewards/format_reward": 0.7083333432674408,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 2971.291748046875,
"epoch": 0.03542857142857143,
"grad_norm": 0.7076906561851501,
"kl": 0.004241943359375,
"learning_rate": 9.98421786662277e-07,
"loss": -0.5466,
"reward": 0.17500000819563866,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.2916666679084301,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 3438.25,
"epoch": 0.036,
"grad_norm": 0.8365570902824402,
"kl": 0.0075531005859375,
"learning_rate": 9.981479793771866e-07,
"loss": -1.6168,
"reward": 0.17500000447034836,
"reward_std": 0.26995331048965454,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.1666666679084301,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 3380.7083740234375,
"epoch": 0.036571428571428574,
"grad_norm": 0.727074384689331,
"kl": 0.00412750244140625,
"learning_rate": 9.97852329991824e-07,
"loss": -0.9846,
"reward": 0.07500000298023224,
"reward_std": 0.08215838670730591,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.125,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 3198.916748046875,
"epoch": 0.037142857142857144,
"grad_norm": 0.9209883809089661,
"kl": 0.0122222900390625,
"learning_rate": 9.975348529157229e-07,
"loss": -1.6992,
"reward": 0.22500000894069672,
"reward_std": 0.21632246673107147,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.2916666716337204,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 2440.08349609375,
"epoch": 0.037714285714285714,
"grad_norm": 1.0546424388885498,
"kl": 0.00701904296875,
"learning_rate": 9.971955636222684e-07,
"loss": -2.3799,
"reward": 0.550000011920929,
"reward_std": 0.3548535108566284,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.7500000298023224,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 2116.5,
"epoch": 0.038285714285714284,
"grad_norm": 1.3037738800048828,
"kl": 0.01519775390625,
"learning_rate": 9.968344786479415e-07,
"loss": -1.6571,
"reward": 0.45000001788139343,
"reward_std": 0.24978766590356827,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.625,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 3452.666748046875,
"epoch": 0.038857142857142854,
"grad_norm": 0.39050254225730896,
"kl": 0.00258636474609375,
"learning_rate": 9.964516155915151e-07,
"loss": -0.6092,
"reward": 0.07500000298023224,
"reward_std": 0.12549901008605957,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.0833333358168602,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 3062.0,
"epoch": 0.03942857142857143,
"grad_norm": 0.89232337474823,
"kl": 0.00434112548828125,
"learning_rate": 9.960469931131936e-07,
"loss": -1.2788,
"reward": 0.30000000447034836,
"reward_std": 0.2658701241016388,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.3333333544433117,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2577.166748046875,
"epoch": 0.04,
"grad_norm": 1.3402986526489258,
"kl": 0.0091552734375,
"learning_rate": 9.956206309337066e-07,
"loss": -1.5284,
"reward": 0.42500000447034836,
"reward_std": 0.20463287830352783,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/format_reward": 0.4166666679084301,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1986.875,
"epoch": 0.04057142857142857,
"grad_norm": 1.0082634687423706,
"kl": 0.004364013671875,
"learning_rate": 9.951725498333448e-07,
"loss": -1.6703,
"reward": 0.6500000059604645,
"reward_std": 0.19993415474891663,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.625,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 2902.125,
"epoch": 0.04114285714285714,
"grad_norm": 0.5259437561035156,
"kl": 0.008880615234375,
"learning_rate": 9.947027716509488e-07,
"loss": -0.8052,
"reward": 0.3500000163912773,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.3333333358168602,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 2200.7500610351562,
"epoch": 0.04171428571428572,
"grad_norm": 0.7980517745018005,
"kl": 0.01031494140625,
"learning_rate": 9.942113192828444e-07,
"loss": -0.9314,
"reward": 0.5,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.6666666865348816,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 2764.2083740234375,
"epoch": 0.04228571428571429,
"grad_norm": 1.3613859415054321,
"kl": 0.00328826904296875,
"learning_rate": 9.93698216681727e-07,
"loss": -1.641,
"reward": 0.3500000163912773,
"reward_std": 0.28679126501083374,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.4166666679084301,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 2020.166748046875,
"epoch": 0.04285714285714286,
"grad_norm": 1.677713394165039,
"kl": 0.015411376953125,
"learning_rate": 9.931634888554935e-07,
"loss": -1.892,
"reward": 0.5500000715255737,
"reward_std": 0.29662763327360153,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.7916666865348816,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 3099.291748046875,
"epoch": 0.04342857142857143,
"grad_norm": 0.7256157398223877,
"kl": 0.0051727294921875,
"learning_rate": 9.926071618660237e-07,
"loss": -1.579,
"reward": 0.2750000059604645,
"reward_std": 0.22555401921272278,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.3333333432674408,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 3111.20849609375,
"epoch": 0.044,
"grad_norm": 0.5698821544647217,
"kl": 0.011016845703125,
"learning_rate": 9.9202926282791e-07,
"loss": 0.0018,
"reward": 0.42500004172325134,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.4583333432674408,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 2611.541748046875,
"epoch": 0.044571428571428574,
"grad_norm": 1.2385872602462769,
"kl": 0.009613037109375,
"learning_rate": 9.91429819907136e-07,
"loss": -2.4486,
"reward": 0.5750000178813934,
"reward_std": 0.42487265169620514,
"rewards/accuracy_reward": 0.4166666716337204,
"rewards/format_reward": 0.5416666716337204,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 2208.3751220703125,
"epoch": 0.045142857142857144,
"grad_norm": 0.9444563388824463,
"kl": 0.011566162109375,
"learning_rate": 9.908088623197048e-07,
"loss": -2.1262,
"reward": 0.550000011920929,
"reward_std": 0.29662764072418213,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.6666666865348816,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 3514.791748046875,
"epoch": 0.045714285714285714,
"grad_norm": 0.6574345231056213,
"kl": 0.0064697265625,
"learning_rate": 9.901664203302124e-07,
"loss": -0.9746,
"reward": 0.07500000298023224,
"reward_std": 0.08215838670730591,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.125,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 2035.7501220703125,
"epoch": 0.046285714285714284,
"grad_norm": 1.1667371988296509,
"kl": 0.02191162109375,
"learning_rate": 9.895025252503755e-07,
"loss": -1.7981,
"reward": 0.6500000059604645,
"reward_std": 0.28908342123031616,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.8333333730697632,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 2879.2083740234375,
"epoch": 0.046857142857142854,
"grad_norm": 0.5121353268623352,
"kl": 0.009796142578125,
"learning_rate": 9.888172094375033e-07,
"loss": -0.8266,
"reward": 0.25,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3333333432674408,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1211.4166870117188,
"epoch": 0.04742857142857143,
"grad_norm": 0.9632378816604614,
"kl": 0.01214599609375,
"learning_rate": 9.881105062929221e-07,
"loss": -0.5763,
"reward": 0.7250000536441803,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.4166666716337204,
"rewards/format_reward": 0.7916666865348816,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 1922.916748046875,
"epoch": 0.048,
"grad_norm": 1.0117182731628418,
"kl": 0.009674072265625,
"learning_rate": 9.873824502603459e-07,
"loss": -0.7207,
"reward": 0.6000000536441803,
"reward_std": 0.27253396064043045,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/format_reward": 0.7083333432674408,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 2529.875,
"epoch": 0.04857142857142857,
"grad_norm": 1.2297358512878418,
"kl": 0.023101806640625,
"learning_rate": 9.866330768241983e-07,
"loss": -1.8915,
"reward": 0.4000000059604645,
"reward_std": 0.22963720560073853,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.5416666865348816,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 2450.291748046875,
"epoch": 0.04914285714285714,
"grad_norm": 0.9087566137313843,
"kl": 0.014007568359375,
"learning_rate": 9.85862422507884e-07,
"loss": -1.8448,
"reward": 0.8500000536441803,
"reward_std": 0.464758038520813,
"rewards/accuracy_reward": 0.5833333432674408,
"rewards/format_reward": 0.8333333730697632,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 2870.5,
"epoch": 0.04971428571428571,
"grad_norm": 0.9272903800010681,
"kl": 0.01702880859375,
"learning_rate": 9.850705248720068e-07,
"loss": -0.4444,
"reward": 0.32500001415610313,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.2916666679084301,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 2377.8333740234375,
"epoch": 0.05028571428571429,
"grad_norm": 0.9736530184745789,
"kl": 0.013763427734375,
"learning_rate": 9.8425742251254e-07,
"loss": -1.3592,
"reward": 0.45000001788139343,
"reward_std": 0.30921074748039246,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.6250000298023224,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 2418.791748046875,
"epoch": 0.05085714285714286,
"grad_norm": 0.9435445666313171,
"kl": 0.023223876953125,
"learning_rate": 9.83423155058946e-07,
"loss": -1.928,
"reward": 0.3500000238418579,
"reward_std": 0.26902148127555847,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.4583333432674408,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 2293.8751220703125,
"epoch": 0.05142857142857143,
"grad_norm": 1.0847179889678955,
"kl": 0.0203857421875,
"learning_rate": 9.825677631722435e-07,
"loss": -1.8569,
"reward": 0.8250000774860382,
"reward_std": 0.42503853142261505,
"rewards/accuracy_reward": 0.5416666865348816,
"rewards/format_reward": 0.8333333730697632,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 3049.3751220703125,
"epoch": 0.052,
"grad_norm": 0.5584085583686829,
"kl": 0.016754150390625,
"learning_rate": 9.816912885430258e-07,
"loss": -0.8024,
"reward": 0.40000002086162567,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.4166666716337204,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2802.0,
"epoch": 0.052571428571428575,
"grad_norm": 0.9023773670196533,
"kl": 0.0162353515625,
"learning_rate": 9.807937738894303e-07,
"loss": 0.0026,
"reward": 0.30000001192092896,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.25,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2122.5,
"epoch": 0.053142857142857144,
"grad_norm": 1.0342284440994263,
"kl": 0.016387939453125,
"learning_rate": 9.798752629550546e-07,
"loss": 0.0026,
"reward": 0.42500004172325134,
"reward_std": 0.13869690895080566,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.5,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 2639.2501220703125,
"epoch": 0.053714285714285714,
"grad_norm": 0.929490864276886,
"kl": 0.014007568359375,
"learning_rate": 9.78935800506826e-07,
"loss": -2.0397,
"reward": 0.5250000059604645,
"reward_std": 0.2479735016822815,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.7083333730697632,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 1750.0833740234375,
"epoch": 0.054285714285714284,
"grad_norm": 1.3795406818389893,
"kl": 0.01446533203125,
"learning_rate": 9.779754323328192e-07,
"loss": -2.9239,
"reward": 0.675000011920929,
"reward_std": 0.39452049136161804,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/format_reward": 0.8333333730697632,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 3076.70849609375,
"epoch": 0.054857142857142854,
"grad_norm": 0.8867998123168945,
"kl": 0.01177978515625,
"learning_rate": 9.769942052400235e-07,
"loss": -1.3267,
"reward": 0.2500000223517418,
"reward_std": 0.24494898319244385,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3333333544433117,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 2068.1250610351562,
"epoch": 0.05542857142857143,
"grad_norm": 1.0344054698944092,
"kl": 0.0184326171875,
"learning_rate": 9.759921670520634e-07,
"loss": -0.6738,
"reward": 0.3749999962747097,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.5416666679084301,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1832.9584350585938,
"epoch": 0.056,
"grad_norm": 0.7039546966552734,
"kl": 0.0093536376953125,
"learning_rate": 9.749693666068663e-07,
"loss": -1.7297,
"reward": 0.7250000536441803,
"reward_std": 0.31179559230804443,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.75,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 2053.041748046875,
"epoch": 0.05657142857142857,
"grad_norm": 1.1954854726791382,
"kl": 0.008026123046875,
"learning_rate": 9.739258537542835e-07,
"loss": -1.1949,
"reward": 0.6500000357627869,
"reward_std": 0.3548534959554672,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.6250000298023224,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 1900.1250610351562,
"epoch": 0.05714285714285714,
"grad_norm": 0.692438542842865,
"kl": 0.008209228515625,
"learning_rate": 9.728616793536587e-07,
"loss": -0.8441,
"reward": 0.6750000715255737,
"reward_std": 0.26995329558849335,
"rewards/accuracy_reward": 0.5000000298023224,
"rewards/format_reward": 0.625,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 2909.0833740234375,
"epoch": 0.05771428571428571,
"grad_norm": 0.7268219590187073,
"kl": 0.0091552734375,
"learning_rate": 9.717768952713511e-07,
"loss": -1.6243,
"reward": 0.30000001192092896,
"reward_std": 0.2323790192604065,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.3333333432674408,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 2342.25,
"epoch": 0.05828571428571429,
"grad_norm": 1.311353087425232,
"kl": 0.0206298828125,
"learning_rate": 9.706715543782064e-07,
"loss": -2.3993,
"reward": 0.42500004172325134,
"reward_std": 0.3704479932785034,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.5,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 2729.7916870117188,
"epoch": 0.05885714285714286,
"grad_norm": 1.082575798034668,
"kl": 0.0152587890625,
"learning_rate": 9.695457105469804e-07,
"loss": -2.3919,
"reward": 0.2500000111758709,
"reward_std": 0.2861757278442383,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.2916666679084301,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 2986.5833740234375,
"epoch": 0.05942857142857143,
"grad_norm": 0.5590086579322815,
"kl": 0.011871337890625,
"learning_rate": 9.683994186497132e-07,
"loss": -0.5574,
"reward": 0.30000003427267075,
"reward_std": 0.19993416219949722,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.2916666679084301,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2627.33349609375,
"epoch": 0.06,
"grad_norm": 0.7722799777984619,
"kl": 0.02337646484375,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0037,
"reward": 0.32500001788139343,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.5,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 2862.666748046875,
"epoch": 0.060571428571428575,
"grad_norm": 0.46922969818115234,
"kl": 0.009735107421875,
"learning_rate": 9.66045715125541e-07,
"loss": -0.7062,
"reward": 0.27500003576278687,
"reward_std": 0.20463287830352783,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.2916666865348816,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 2396.3750610351562,
"epoch": 0.061142857142857145,
"grad_norm": 0.8894286155700684,
"kl": 0.01702880859375,
"learning_rate": 9.648384182148252e-07,
"loss": -1.4031,
"reward": 0.3499999940395355,
"reward_std": 0.1741531491279602,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.5000000298023224,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 2107.0833740234375,
"epoch": 0.061714285714285715,
"grad_norm": 0.5590862035751343,
"kl": 0.0107421875,
"learning_rate": 9.636109026648554e-07,
"loss": -0.6467,
"reward": 0.32500000298023224,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5416666865348816,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 3016.291748046875,
"epoch": 0.062285714285714285,
"grad_norm": 0.4933023750782013,
"kl": 0.0123291015625,
"learning_rate": 9.623632283030077e-07,
"loss": 0.002,
"reward": 0.17499999701976776,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.25,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 2778.0001220703125,
"epoch": 0.06285714285714286,
"grad_norm": 0.7055474519729614,
"kl": 0.024871826171875,
"learning_rate": 9.610954559391704e-07,
"loss": -0.7942,
"reward": 0.2500000223517418,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3333333358168602,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2845.041748046875,
"epoch": 0.06342857142857143,
"grad_norm": 2.253298044204712,
"kl": 0.01910400390625,
"learning_rate": 9.598076473627796e-07,
"loss": 0.003,
"reward": 0.2250000238418579,
"reward_std": 0.08215838670730591,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.25,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 2531.916748046875,
"epoch": 0.064,
"grad_norm": 0.9246645569801331,
"kl": 0.015045166015625,
"learning_rate": 9.58499865339809e-07,
"loss": -2.463,
"reward": 0.3750000298023224,
"reward_std": 0.28209254145622253,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.5000000298023224,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 2310.4583740234375,
"epoch": 0.06457142857142857,
"grad_norm": 0.46783214807510376,
"kl": 0.01202392578125,
"learning_rate": 9.571721736097088e-07,
"loss": -0.7241,
"reward": 0.3750000149011612,
"reward_std": 0.17702671885490417,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.4583333432674408,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 3185.666748046875,
"epoch": 0.06514285714285714,
"grad_norm": 0.66249018907547,
"kl": 0.01556396484375,
"learning_rate": 9.55824636882301e-07,
"loss": -0.9383,
"reward": 0.10000000894069672,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.1666666716337204,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 2467.2501220703125,
"epoch": 0.06571428571428571,
"grad_norm": 0.9964971542358398,
"kl": 0.0130615234375,
"learning_rate": 9.54457320834625e-07,
"loss": -2.1875,
"reward": 0.32500001788139343,
"reward_std": 0.2611714005470276,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.4583333432674408,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 2428.5416870117188,
"epoch": 0.06628571428571428,
"grad_norm": 0.472149521112442,
"kl": 0.01055908203125,
"learning_rate": 9.530702921077358e-07,
"loss": 0.0017,
"reward": 0.32500001788139343,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.5,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 2962.041748046875,
"epoch": 0.06685714285714285,
"grad_norm": 1.044438362121582,
"kl": 0.0174560546875,
"learning_rate": 9.516636183034564e-07,
"loss": -2.2953,
"reward": 0.5999999940395355,
"reward_std": 0.44171059131622314,
"rewards/accuracy_reward": 0.5000000298023224,
"rewards/format_reward": 0.5000000298023224,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 1725.3333740234375,
"epoch": 0.06742857142857143,
"grad_norm": 0.6704514622688293,
"kl": 0.013427734375,
"learning_rate": 9.502373679810839e-07,
"loss": -0.7163,
"reward": 0.6250000298023224,
"reward_std": 0.11291590332984924,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.7083333432674408,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2067.8333740234375,
"epoch": 0.068,
"grad_norm": 0.798595666885376,
"kl": 0.014404296875,
"learning_rate": 9.487916106540465e-07,
"loss": 0.0023,
"reward": 0.6000000238418579,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 2834.75,
"epoch": 0.06857142857142857,
"grad_norm": 0.5755884051322937,
"kl": 0.01953125,
"learning_rate": 9.473264167865171e-07,
"loss": -0.8146,
"reward": 0.25,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3333333432674408,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 2840.291748046875,
"epoch": 0.06914285714285714,
"grad_norm": 0.5671223402023315,
"kl": 0.018829345703125,
"learning_rate": 9.458418577899774e-07,
"loss": -0.7081,
"reward": 0.2750000022351742,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.2916666679084301,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1911.1251220703125,
"epoch": 0.06971428571428571,
"grad_norm": 0.6135214567184448,
"kl": 0.014678955078125,
"learning_rate": 9.443380060197385e-07,
"loss": 0.0024,
"reward": 0.7750000357627869,
"reward_std": 0.13869690895080566,
"rewards/accuracy_reward": 0.5416666865348816,
"rewards/format_reward": 0.75,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 2615.7083740234375,
"epoch": 0.07028571428571428,
"grad_norm": 1.535902976989746,
"kl": 0.019317626953125,
"learning_rate": 9.428149347714143e-07,
"loss": -3.4492,
"reward": 0.5000000447034836,
"reward_std": 0.49277445673942566,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.5000000298023224,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 2879.6251220703125,
"epoch": 0.07085714285714285,
"grad_norm": 1.1739275455474854,
"kl": 0.016326904296875,
"learning_rate": 9.412727182773486e-07,
"loss": -2.466,
"reward": 0.5,
"reward_std": 0.22085529565811157,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5833333432674408,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 1907.7916870117188,
"epoch": 0.07142857142857142,
"grad_norm": 0.5758523344993591,
"kl": 0.016204833984375,
"learning_rate": 9.397114317029974e-07,
"loss": -0.8541,
"reward": 0.5750000476837158,
"reward_std": 0.19540132582187653,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/format_reward": 0.6666666716337204,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1292.2916870117188,
"epoch": 0.072,
"grad_norm": 0.6998382806777954,
"kl": 0.009063720703125,
"learning_rate": 9.381311511432658e-07,
"loss": -0.7275,
"reward": 0.7500000298023224,
"reward_std": 0.16431677341461182,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.875,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2668.916748046875,
"epoch": 0.07257142857142856,
"grad_norm": 0.6192981004714966,
"kl": 0.0121612548828125,
"learning_rate": 9.36531953618799e-07,
"loss": -0.9637,
"reward": 0.3750000149011612,
"reward_std": 0.08215838670730591,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.375,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 2330.33349609375,
"epoch": 0.07314285714285715,
"grad_norm": 0.850170373916626,
"kl": 0.0172271728515625,
"learning_rate": 9.34913917072228e-07,
"loss": -1.662,
"reward": 0.5500000417232513,
"reward_std": 0.1741531491279602,
"rewards/accuracy_reward": 0.2916666865348816,
"rewards/format_reward": 0.6250000298023224,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 1855.9166870117188,
"epoch": 0.07371428571428572,
"grad_norm": 0.6127024292945862,
"kl": 0.013641357421875,
"learning_rate": 9.332771203643714e-07,
"loss": -0.7422,
"reward": 0.5500000417232513,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.7083333432674408,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 1596.4583740234375,
"epoch": 0.07428571428571429,
"grad_norm": 0.6304543614387512,
"kl": 0.01348876953125,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0022,
"reward": 0.6500000655651093,
"reward_std": 0.24177644401788712,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/format_reward": 0.75,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 2883.291748046875,
"epoch": 0.07485714285714286,
"grad_norm": 0.4680456817150116,
"kl": 0.01495361328125,
"learning_rate": 9.299475664759068e-07,
"loss": 0.0024,
"reward": 0.15000000596046448,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.25,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 2198.375,
"epoch": 0.07542857142857143,
"grad_norm": 0.583177924156189,
"kl": 0.013580322265625,
"learning_rate": 9.282549715730579e-07,
"loss": -1.4613,
"reward": 0.5750000476837158,
"reward_std": 0.15610557794570923,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/format_reward": 0.6666666865348816,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 2586.6666870117188,
"epoch": 0.076,
"grad_norm": 0.4416232705116272,
"kl": 0.0152587890625,
"learning_rate": 9.265439410565328e-07,
"loss": -0.7425,
"reward": 0.4750000238418579,
"reward_std": 0.20463287830352783,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.4583333432674408,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 2783.4583740234375,
"epoch": 0.07657142857142857,
"grad_norm": 0.8242610096931458,
"kl": 0.01690673828125,
"learning_rate": 9.248145583195447e-07,
"loss": -1.5028,
"reward": 0.3500000163912773,
"reward_std": 0.24494898319244385,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.3750000223517418,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 2785.666748046875,
"epoch": 0.07714285714285714,
"grad_norm": 0.8124563097953796,
"kl": 0.02239990234375,
"learning_rate": 9.230669076497687e-07,
"loss": -0.6555,
"reward": 0.19999999925494194,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.2916666679084301,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1172.4583740234375,
"epoch": 0.07771428571428571,
"grad_norm": 1.0723152160644531,
"kl": 0.018798828125,
"learning_rate": 9.213010742252327e-07,
"loss": -0.9192,
"reward": 0.675000011920929,
"reward_std": 0.2611714005470276,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.875,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 2302.041748046875,
"epoch": 0.07828571428571429,
"grad_norm": 0.9255122542381287,
"kl": 0.0070953369140625,
"learning_rate": 9.195171441101668e-07,
"loss": -2.5899,
"reward": 0.42500001192092896,
"reward_std": 0.4010545611381531,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.5,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 2176.9583740234375,
"epoch": 0.07885714285714286,
"grad_norm": 0.5417965650558472,
"kl": 0.016082763671875,
"learning_rate": 9.177152042508077e-07,
"loss": 0.0026,
"reward": 0.45000001788139343,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 2479.8333740234375,
"epoch": 0.07942857142857143,
"grad_norm": 0.8476680517196655,
"kl": 0.019317626953125,
"learning_rate": 9.158953424711624e-07,
"loss": -1.9513,
"reward": 0.30000003799796104,
"reward_std": 0.2173428237438202,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.4583333544433117,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 1950.8334350585938,
"epoch": 0.08,
"grad_norm": 0.32817330956459045,
"kl": 0.0100860595703125,
"learning_rate": 9.140576474687263e-07,
"loss": -0.7432,
"reward": 0.5750000178813934,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.7083333432674408,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1543.7500610351562,
"epoch": 0.08057142857142857,
"grad_norm": 0.5753952264785767,
"kl": 0.01080322265625,
"learning_rate": 9.122022088101613e-07,
"loss": 0.0017,
"reward": 0.4750000089406967,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.75,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 2146.2083740234375,
"epoch": 0.08114285714285714,
"grad_norm": 0.7607588171958923,
"kl": 0.01971435546875,
"learning_rate": 9.103291169269299e-07,
"loss": -1.4222,
"reward": 0.4750000238418579,
"reward_std": 0.280418336391449,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5416666865348816,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 2653.3751220703125,
"epoch": 0.08171428571428571,
"grad_norm": 1.6966173648834229,
"kl": 0.0238037109375,
"learning_rate": 9.084384631108882e-07,
"loss": -1.9226,
"reward": 0.45000001788139343,
"reward_std": 0.2773938328027725,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/format_reward": 0.4166666865348816,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 1880.166748046875,
"epoch": 0.08228571428571428,
"grad_norm": 1.1052340269088745,
"kl": 0.018768310546875,
"learning_rate": 9.065303395098358e-07,
"loss": -0.9172,
"reward": 0.7750000357627869,
"reward_std": 0.22493848204612732,
"rewards/accuracy_reward": 0.4166666716337204,
"rewards/format_reward": 0.875,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 886.25,
"epoch": 0.08285714285714285,
"grad_norm": 0.5123775601387024,
"kl": 0.016632080078125,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0027,
"reward": 0.6500000059604645,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 1.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1539.8333435058594,
"epoch": 0.08342857142857144,
"grad_norm": 0.5727400779724121,
"kl": 0.01708984375,
"learning_rate": 9.026620557966279e-07,
"loss": 0.0027,
"reward": 0.6500000357627869,
"reward_std": 0.22085530310869217,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.75,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 1381.3750610351562,
"epoch": 0.084,
"grad_norm": 0.8086219429969788,
"kl": 0.01629638671875,
"learning_rate": 9.007020842191634e-07,
"loss": -1.4881,
"reward": 0.7750000357627869,
"reward_std": 0.31787581741809845,
"rewards/accuracy_reward": 0.4166666865348816,
"rewards/format_reward": 0.8750000298023224,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1451.5,
"epoch": 0.08457142857142858,
"grad_norm": 0.8584139347076416,
"kl": 0.01898193359375,
"learning_rate": 8.987250199168808e-07,
"loss": -0.9353,
"reward": 0.550000011920929,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.8333333432674408,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 1420.4583740234375,
"epoch": 0.08514285714285715,
"grad_norm": 0.8129308223724365,
"kl": 0.014434814453125,
"learning_rate": 8.967309592491052e-07,
"loss": -1.3771,
"reward": 0.6000000536441803,
"reward_std": 0.19993415474891663,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.75,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 2162.666748046875,
"epoch": 0.08571428571428572,
"grad_norm": 0.889790952205658,
"kl": 0.02252197265625,
"learning_rate": 8.9471999940354e-07,
"loss": -0.58,
"reward": 0.42500004172325134,
"reward_std": 0.20463287830352783,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.5416666865348816,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 2218.7916870117188,
"epoch": 0.08628571428571429,
"grad_norm": 0.5901851654052734,
"kl": 0.013671875,
"learning_rate": 8.926922383915315e-07,
"loss": -0.7892,
"reward": 0.5500000268220901,
"reward_std": 0.22085529565811157,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.5833333432674408,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 2912.916748046875,
"epoch": 0.08685714285714285,
"grad_norm": 0.5517586469650269,
"kl": 0.021484375,
"learning_rate": 8.906477750432903e-07,
"loss": -0.891,
"reward": 0.3250000327825546,
"reward_std": 0.1596180573105812,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.3333333432674408,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 2069.916748046875,
"epoch": 0.08742857142857142,
"grad_norm": 0.6771623492240906,
"kl": 0.015380859375,
"learning_rate": 8.88586709003076e-07,
"loss": -1.2495,
"reward": 0.4750000238418579,
"reward_std": 0.15610557794570923,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/format_reward": 0.5,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 2867.5833740234375,
"epoch": 0.088,
"grad_norm": 0.6926563382148743,
"kl": 0.017333984375,
"learning_rate": 8.865091407243394e-07,
"loss": -1.5406,
"reward": 0.27500002086162567,
"reward_std": 0.2479735016822815,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.291666679084301,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 3541.0833740234375,
"epoch": 0.08857142857142856,
"grad_norm": 0.6008081436157227,
"kl": 0.01416015625,
"learning_rate": 8.844151714648274e-07,
"loss": -0.6494,
"reward": 0.07500000298023224,
"reward_std": 0.12549901008605957,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.0416666679084301,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1990.041748046875,
"epoch": 0.08914285714285715,
"grad_norm": 1.1144965887069702,
"kl": 0.01812744140625,
"learning_rate": 8.823049032816478e-07,
"loss": -2.2955,
"reward": 0.675000011920929,
"reward_std": 0.3254331648349762,
"rewards/accuracy_reward": 0.3750000149011612,
"rewards/format_reward": 0.7500000298023224,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2252.8333740234375,
"epoch": 0.08971428571428572,
"grad_norm": 0.5644240379333496,
"kl": 0.012451171875,
"learning_rate": 8.801784390262943e-07,
"loss": 0.002,
"reward": 0.3750000298023224,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.5,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1755.791748046875,
"epoch": 0.09028571428571429,
"grad_norm": 0.8909981846809387,
"kl": 0.0125732421875,
"learning_rate": 8.780358823396352e-07,
"loss": -1.3986,
"reward": 0.7000000476837158,
"reward_std": 0.38667041063308716,
"rewards/accuracy_reward": 0.3750000223517418,
"rewards/format_reward": 0.7916666865348816,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 3373.541748046875,
"epoch": 0.09085714285714286,
"grad_norm": 0.568712055683136,
"kl": 0.016937255859375,
"learning_rate": 8.758773376468604e-07,
"loss": -0.678,
"reward": 0.05000000447034836,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0833333358168602,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 3461.0833740234375,
"epoch": 0.09142857142857143,
"grad_norm": 0.5345960855484009,
"kl": 0.01690673828125,
"learning_rate": 8.737029101523929e-07,
"loss": -1.3097,
"reward": 0.07500000298023224,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.0833333358168602,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 2223.291748046875,
"epoch": 0.092,
"grad_norm": 0.8153612017631531,
"kl": 0.0274658203125,
"learning_rate": 8.715127058347614e-07,
"loss": -0.9097,
"reward": 0.6000000089406967,
"reward_std": 0.32863354682922363,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.625,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 2307.875,
"epoch": 0.09257142857142857,
"grad_norm": 0.5529870986938477,
"kl": 0.013397216796875,
"learning_rate": 8.693068314414344e-07,
"loss": -1.2837,
"reward": 0.5000000447034836,
"reward_std": 0.2773938328027725,
"rewards/accuracy_reward": 0.3750000149011612,
"rewards/format_reward": 0.4583333432674408,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 2981.0,
"epoch": 0.09314285714285714,
"grad_norm": 0.8973040580749512,
"kl": 0.0225830078125,
"learning_rate": 8.670853944836176e-07,
"loss": -1.6263,
"reward": 0.20000001788139343,
"reward_std": 0.21162375807762146,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.25,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 2735.875,
"epoch": 0.09371428571428571,
"grad_norm": 1.581978678703308,
"kl": 0.016082763671875,
"learning_rate": 8.648485032310144e-07,
"loss": -2.4628,
"reward": 0.42500001192092896,
"reward_std": 0.35955221951007843,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.5833333432674408,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 3489.4583740234375,
"epoch": 0.09428571428571429,
"grad_norm": 1.0414494276046753,
"kl": 0.0177001953125,
"learning_rate": 8.625962667065487e-07,
"loss": -2.4063,
"reward": 0.15000000223517418,
"reward_std": 0.26419591903686523,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.2083333395421505,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09485714285714286,
"grad_norm": 0.3711743652820587,
"kl": 0.015045166015625,
"learning_rate": 8.603287946810513e-07,
"loss": -0.6281,
"reward": 0.05000000447034836,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.0416666679084301,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 1534.9584350585938,
"epoch": 0.09542857142857143,
"grad_norm": 0.7632970809936523,
"kl": 0.02166748046875,
"learning_rate": 8.580461976679099e-07,
"loss": -0.8458,
"reward": 0.8250000774860382,
"reward_std": 0.29361626505851746,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.9166666865348816,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 2715.8333740234375,
"epoch": 0.096,
"grad_norm": 0.774446427822113,
"kl": 0.02288818359375,
"learning_rate": 8.557485869176825e-07,
"loss": -1.9958,
"reward": 0.40000003576278687,
"reward_std": 0.29662763327360153,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/format_reward": 0.4583333432674408,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1717.4166870117188,
"epoch": 0.09657142857142857,
"grad_norm": 0.6996132135391235,
"kl": 0.01715087890625,
"learning_rate": 8.534360744126753e-07,
"loss": -1.6658,
"reward": 0.5500000268220901,
"reward_std": 0.29662764072418213,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.7916666865348816,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 3413.9583740234375,
"epoch": 0.09714285714285714,
"grad_norm": 0.6789911389350891,
"kl": 0.02880859375,
"learning_rate": 8.511087728614862e-07,
"loss": -1.5654,
"reward": 0.1250000111758709,
"reward_std": 0.2611714005470276,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.1666666679084301,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 2726.70849609375,
"epoch": 0.09771428571428571,
"grad_norm": 0.3861570954322815,
"kl": 0.01446533203125,
"learning_rate": 8.487667956935087e-07,
"loss": 0.0023,
"reward": 0.4000000059604645,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.5,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 2290.6251220703125,
"epoch": 0.09828571428571428,
"grad_norm": 0.9113690257072449,
"kl": 0.0257568359375,
"learning_rate": 8.464102570534061e-07,
"loss": -0.7408,
"reward": 0.2750000059604645,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.4583333432674408,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1820.666748046875,
"epoch": 0.09885714285714285,
"grad_norm": 0.9667136073112488,
"kl": 0.02313232421875,
"learning_rate": 8.440392717955475e-07,
"loss": -1.6142,
"reward": 0.4750000238418579,
"reward_std": 0.26889464259147644,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.7083333432674408,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 3118.0,
"epoch": 0.09942857142857142,
"grad_norm": 0.8733185529708862,
"kl": 0.0316162109375,
"learning_rate": 8.416539554784089e-07,
"loss": -1.6321,
"reward": 0.1250000111758709,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.2083333395421505,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 1706.4583435058594,
"epoch": 0.1,
"grad_norm": 0.45300906896591187,
"kl": 0.012939453125,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0021,
"reward": 0.8000000566244125,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.6250000149011612,
"rewards/format_reward": 0.7083333432674408,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1727.5417175292969,
"epoch": 0.10057142857142858,
"grad_norm": 0.659142255783081,
"kl": 0.021392822265625,
"learning_rate": 8.368407953869103e-07,
"loss": 0.0034,
"reward": 0.6000000536441803,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/format_reward": 0.7083333432674408,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 2159.25,
"epoch": 0.10114285714285715,
"grad_norm": 0.8017681837081909,
"kl": 0.013885498046875,
"learning_rate": 8.344131861991828e-07,
"loss": 0.0022,
"reward": 0.4750000238418579,
"reward_std": 0.13869690895080566,
"rewards/accuracy_reward": 0.2916666865348816,
"rewards/format_reward": 0.5,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 3375.9583740234375,
"epoch": 0.10171428571428572,
"grad_norm": 1.2313588857650757,
"kl": 0.015167236328125,
"learning_rate": 8.319717151140072e-07,
"loss": -2.2319,
"reward": 0.17500000447034836,
"reward_std": 0.3254331722855568,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.2083333358168602,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 3116.625,
"epoch": 0.10228571428571429,
"grad_norm": 0.6798207759857178,
"kl": 0.02423095703125,
"learning_rate": 8.295165011252396e-07,
"loss": -1.4557,
"reward": 0.2750000059604645,
"reward_std": 0.26783522963523865,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.2916666865348816,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 2302.25,
"epoch": 0.10285714285714286,
"grad_norm": 0.5794097781181335,
"kl": 0.022064208984375,
"learning_rate": 8.270476638965461e-07,
"loss": -0.532,
"reward": 0.5000000298023224,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/format_reward": 0.5,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 2662.5833740234375,
"epoch": 0.10342857142857143,
"grad_norm": 0.6577425599098206,
"kl": 0.019775390625,
"learning_rate": 8.245653237555705e-07,
"loss": -1.5816,
"reward": 0.4500000402331352,
"reward_std": 0.19993416219949722,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/format_reward": 0.4583333544433117,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 2937.2501220703125,
"epoch": 0.104,
"grad_norm": 0.8629750609397888,
"kl": 0.02593994140625,
"learning_rate": 8.220696016880687e-07,
"loss": -1.5823,
"reward": 0.15000001341104507,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.2500000074505806,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 2815.2916870117188,
"epoch": 0.10457142857142857,
"grad_norm": 0.6297455430030823,
"kl": 0.02032470703125,
"learning_rate": 8.195606193320136e-07,
"loss": -0.8378,
"reward": 0.27500003576278687,
"reward_std": 0.11291590332984924,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.4166666865348816,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 1781.041748046875,
"epoch": 0.10514285714285715,
"grad_norm": 0.5706174969673157,
"kl": 0.012176513671875,
"learning_rate": 8.170384989716657e-07,
"loss": -0.743,
"reward": 0.7000000476837158,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.7083333432674408,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1956.2500610351562,
"epoch": 0.10571428571428572,
"grad_norm": 0.8781810402870178,
"kl": 0.0162353515625,
"learning_rate": 8.145033635316128e-07,
"loss": -1.4382,
"reward": 0.45000001788139343,
"reward_std": 0.18973666429519653,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.6666666865348816,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 2789.2501220703125,
"epoch": 0.10628571428571429,
"grad_norm": 0.7274031639099121,
"kl": 0.01470947265625,
"learning_rate": 8.119553365707802e-07,
"loss": -1.4622,
"reward": 0.4500000476837158,
"reward_std": 0.36425093561410904,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.4166666716337204,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 3284.791748046875,
"epoch": 0.10685714285714286,
"grad_norm": 0.7924415469169617,
"kl": 0.03204345703125,
"learning_rate": 8.093945422764069e-07,
"loss": -1.1243,
"reward": 0.07500000298023224,
"reward_std": 0.13869690895080566,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.125,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 2522.9583740234375,
"epoch": 0.10742857142857143,
"grad_norm": 0.8641192317008972,
"kl": 0.02154541015625,
"learning_rate": 8.068211054579943e-07,
"loss": -1.8636,
"reward": 0.6000000536441803,
"reward_std": 0.3966957926750183,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.5416666865348816,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 3382.625,
"epoch": 0.108,
"grad_norm": 1.1794185638427734,
"kl": 0.0162353515625,
"learning_rate": 8.04235151541222e-07,
"loss": -2.2379,
"reward": 0.20000001043081284,
"reward_std": 0.32240865379571915,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.2500000074505806,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 2990.625,
"epoch": 0.10857142857142857,
"grad_norm": 0.6209798455238342,
"kl": 0.0179443359375,
"learning_rate": 8.01636806561836e-07,
"loss": -0.7416,
"reward": 0.125,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.2083333432674408,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 2806.8333740234375,
"epoch": 0.10914285714285714,
"grad_norm": 1.071854591369629,
"kl": 0.03564453125,
"learning_rate": 7.990261971595048e-07,
"loss": -1.7873,
"reward": 0.25,
"reward_std": 0.21162375807762146,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3333333432674408,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 2162.95849609375,
"epoch": 0.10971428571428571,
"grad_norm": 0.6578007340431213,
"kl": 0.02581787109375,
"learning_rate": 7.964034505716476e-07,
"loss": -0.6846,
"reward": 0.550000011920929,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5416666865348816,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 3155.791748046875,
"epoch": 0.11028571428571429,
"grad_norm": 0.5719695687294006,
"kl": 0.0224609375,
"learning_rate": 7.93768694627233e-07,
"loss": -1.403,
"reward": 0.10000000521540642,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.1666666679084301,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 1952.041748046875,
"epoch": 0.11085714285714286,
"grad_norm": 1.1899938583374023,
"kl": 0.037109375,
"learning_rate": 7.911220577405484e-07,
"loss": -1.389,
"reward": 0.6500000357627869,
"reward_std": 0.37408730387687683,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.7083333730697632,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 2549.4583740234375,
"epoch": 0.11142857142857143,
"grad_norm": 0.7450345754623413,
"kl": 0.0228271484375,
"learning_rate": 7.884636689049422e-07,
"loss": -1.3458,
"reward": 0.3999999985098839,
"reward_std": 0.2658701241016388,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.5000000111758709,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 2487.7916870117188,
"epoch": 0.112,
"grad_norm": 0.5231258869171143,
"kl": 0.01495361328125,
"learning_rate": 7.857936576865356e-07,
"loss": -0.9285,
"reward": 0.375,
"reward_std": 0.08215838670730591,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.375,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 2426.3334350585938,
"epoch": 0.11257142857142857,
"grad_norm": 0.8786201477050781,
"kl": 0.0322265625,
"learning_rate": 7.831121542179086e-07,
"loss": -1.3101,
"reward": 0.42500003799796104,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.4583333544433117,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 1954.1250610351562,
"epoch": 0.11314285714285714,
"grad_norm": 0.8747029900550842,
"kl": 0.02593994140625,
"learning_rate": 7.804192891917571e-07,
"loss": -1.4407,
"reward": 0.6000000536441803,
"reward_std": 0.3504374995827675,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.6666666716337204,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 2655.416748046875,
"epoch": 0.11371428571428571,
"grad_norm": 0.8738974928855896,
"kl": 0.02545166015625,
"learning_rate": 7.777151938545235e-07,
"loss": -1.3506,
"reward": 0.20000000298023224,
"reward_std": 0.18673625588417053,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.291666679084301,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 2649.4583740234375,
"epoch": 0.11428571428571428,
"grad_norm": 0.7848045229911804,
"kl": 0.026123046875,
"learning_rate": 7.75e-07,
"loss": -1.6035,
"reward": 0.32500001788139343,
"reward_std": 0.26995331048965454,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.4166666716337204,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 2324.0833740234375,
"epoch": 0.11485714285714285,
"grad_norm": 0.6355379819869995,
"kl": 0.017059326171875,
"learning_rate": 7.72273839962904e-07,
"loss": -0.7416,
"reward": 0.30000001192092896,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.4583333432674408,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 2468.916748046875,
"epoch": 0.11542857142857142,
"grad_norm": 0.7434844374656677,
"kl": 0.017822265625,
"learning_rate": 7.695368466124296e-07,
"loss": 0.0029,
"reward": 0.40000003576278687,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.5,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 2735.2916870117188,
"epoch": 0.116,
"grad_norm": 0.4409310221672058,
"kl": 0.01507568359375,
"learning_rate": 7.667891533457718e-07,
"loss": -1.4004,
"reward": 0.30000000447034836,
"reward_std": 0.2658701241016388,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.4166666679084301,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 2205.7501220703125,
"epoch": 0.11657142857142858,
"grad_norm": 0.5194834470748901,
"kl": 0.012176513671875,
"learning_rate": 7.640308940816239e-07,
"loss": 0.002,
"reward": 0.30000001192092896,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 2431.0,
"epoch": 0.11714285714285715,
"grad_norm": 0.5818977355957031,
"kl": 0.018218994140625,
"learning_rate": 7.612622032536507e-07,
"loss": -0.722,
"reward": 0.375,
"reward_std": 0.17702671885490417,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.4583333432674408,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 2947.8751220703125,
"epoch": 0.11771428571428572,
"grad_norm": 1.0808711051940918,
"kl": 0.02557373046875,
"learning_rate": 7.584832158039378e-07,
"loss": -2.5238,
"reward": 0.20000001788139343,
"reward_std": 0.2323790118098259,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.3333333432674408,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 2864.2501220703125,
"epoch": 0.11828571428571429,
"grad_norm": 0.9076454639434814,
"kl": 0.0179443359375,
"learning_rate": 7.556940671764124e-07,
"loss": -1.2846,
"reward": 0.27500002086162567,
"reward_std": 0.2611714079976082,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.3333333358168602,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 2238.541748046875,
"epoch": 0.11885714285714286,
"grad_norm": 0.5197967886924744,
"kl": 0.0123291015625,
"learning_rate": 7.528948933102438e-07,
"loss": -0.7262,
"reward": 0.6250000298023224,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.5416666865348816,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 2626.291748046875,
"epoch": 0.11942857142857143,
"grad_norm": 0.5860099196434021,
"kl": 0.012420654296875,
"learning_rate": 7.500858306332172e-07,
"loss": -1.2149,
"reward": 0.3750000149011612,
"reward_std": 0.13869690895080566,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.375,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 2280.5000610351562,
"epoch": 0.12,
"grad_norm": 1.0006970167160034,
"kl": 0.02508544921875,
"learning_rate": 7.472670160550848e-07,
"loss": -1.5695,
"reward": 0.4000000059604645,
"reward_std": 0.17232800275087357,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.6250000298023224,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1863.8750610351562,
"epoch": 0.12057142857142857,
"grad_norm": 0.7675609588623047,
"kl": 0.013702392578125,
"learning_rate": 7.444385869608921e-07,
"loss": -0.7409,
"reward": 0.6500000357627869,
"reward_std": 0.2658701241016388,
"rewards/accuracy_reward": 0.3750000149011612,
"rewards/format_reward": 0.7083333432674408,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 3500.916748046875,
"epoch": 0.12114285714285715,
"grad_norm": 1.411064863204956,
"kl": 0.02862548828125,
"learning_rate": 7.416006812042827e-07,
"loss": -2.3867,
"reward": 0.22500000149011612,
"reward_std": 0.3480285108089447,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.2083333395421505,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 1511.2500610351562,
"epoch": 0.12171428571428572,
"grad_norm": 0.5711240768432617,
"kl": 0.01910400390625,
"learning_rate": 7.387534371007797e-07,
"loss": 0.0031,
"reward": 0.5250000059604645,
"reward_std": 0.13869690895080566,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.75,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 2068.9584350585938,
"epoch": 0.12228571428571429,
"grad_norm": 0.3741142153739929,
"kl": 0.0205078125,
"learning_rate": 7.358969934210438e-07,
"loss": -0.5851,
"reward": 0.32500000298023224,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5416666865348816,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 2977.791748046875,
"epoch": 0.12285714285714286,
"grad_norm": 1.0709601640701294,
"kl": 0.02203369140625,
"learning_rate": 7.330314893841101e-07,
"loss": -2.8587,
"reward": 0.30000002682209015,
"reward_std": 0.32240864634513855,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.375,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 885.9583435058594,
"epoch": 0.12342857142857143,
"grad_norm": 0.8305730223655701,
"kl": 0.015655517578125,
"learning_rate": 7.301570646506027e-07,
"loss": -0.8887,
"reward": 1.0250000655651093,
"reward_std": 0.11291590332984924,
"rewards/accuracy_reward": 0.7916666865348816,
"rewards/format_reward": 0.9166666865348816,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.124,
"grad_norm": 0.4252244532108307,
"kl": 0.0159912109375,
"learning_rate": 7.27273859315928e-07,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 2363.666748046875,
"epoch": 0.12457142857142857,
"grad_norm": 0.8345184922218323,
"kl": 0.0247802734375,
"learning_rate": 7.243820139034464e-07,
"loss": -1.5168,
"reward": 0.40000003576278687,
"reward_std": 0.2323790118098259,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.5,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 838.2083740234375,
"epoch": 0.12514285714285714,
"grad_norm": 1.1001908779144287,
"kl": 0.01165771484375,
"learning_rate": 7.214816693576234e-07,
"loss": 0.0019,
"reward": 0.9750000536441803,
"reward_std": 0.13869690895080566,
"rewards/accuracy_reward": 0.625,
"rewards/format_reward": 1.0,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 2691.5833740234375,
"epoch": 0.12571428571428572,
"grad_norm": 0.48478028178215027,
"kl": 0.017822265625,
"learning_rate": 7.185729670371604e-07,
"loss": -0.9277,
"reward": 0.5,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5833333432674408,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1311.3333740234375,
"epoch": 0.12628571428571428,
"grad_norm": 0.8523876667022705,
"kl": 0.015106201171875,
"learning_rate": 7.156560487081051e-07,
"loss": 0.0024,
"reward": 0.7500000298023224,
"reward_std": 0.16431677341461182,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.75,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1624.291748046875,
"epoch": 0.12685714285714286,
"grad_norm": 0.6330533623695374,
"kl": 0.02435302734375,
"learning_rate": 7.127310565369415e-07,
"loss": -0.4816,
"reward": 0.675000011920929,
"reward_std": 0.3061862289905548,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/format_reward": 0.7916666865348816,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 2770.0,
"epoch": 0.12742857142857142,
"grad_norm": 0.8254171013832092,
"kl": 0.0244140625,
"learning_rate": 7.097981330836616e-07,
"loss": -1.1907,
"reward": 0.2500000111758709,
"reward_std": 0.24494898319244385,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3333333544433117,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 2910.3751220703125,
"epoch": 0.128,
"grad_norm": 0.570878267288208,
"kl": 0.0177001953125,
"learning_rate": 7.068574212948169e-07,
"loss": -0.8357,
"reward": 0.4500000476837158,
"reward_std": 0.2323790118098259,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.4166666716337204,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 2329.0001220703125,
"epoch": 0.12857142857142856,
"grad_norm": 0.5944868326187134,
"kl": 0.0166015625,
"learning_rate": 7.039090644965509e-07,
"loss": -1.557,
"reward": 0.4500000402331352,
"reward_std": 0.19993416219949722,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.541666679084301,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 1627.3334350585938,
"epoch": 0.12914285714285714,
"grad_norm": 0.7897810935974121,
"kl": 0.019805908203125,
"learning_rate": 7.009532063876148e-07,
"loss": -0.7406,
"reward": 0.6000000238418579,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.2916666865348816,
"rewards/format_reward": 0.7083333432674408,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 1878.7083740234375,
"epoch": 0.12971428571428573,
"grad_norm": 0.5080549716949463,
"kl": 0.01611328125,
"learning_rate": 6.979899910323624e-07,
"loss": -0.7196,
"reward": 0.6500000357627869,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.7083333432674408,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 1201.9167175292969,
"epoch": 0.13028571428571428,
"grad_norm": 0.7728170156478882,
"kl": 0.0109405517578125,
"learning_rate": 6.950195628537299e-07,
"loss": -0.6648,
"reward": 0.7500000596046448,
"reward_std": 0.22085529565811157,
"rewards/accuracy_reward": 0.5000000298023224,
"rewards/format_reward": 0.75,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 1906.3750610351562,
"epoch": 0.13085714285714287,
"grad_norm": 0.6207183003425598,
"kl": 0.015899658203125,
"learning_rate": 6.920420666261961e-07,
"loss": -1.5599,
"reward": 0.5250000357627869,
"reward_std": 0.20463287830352783,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.625,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 2654.2084350585938,
"epoch": 0.13142857142857142,
"grad_norm": 0.7562258243560791,
"kl": 0.01849365234375,
"learning_rate": 6.890576474687263e-07,
"loss": -2.2566,
"reward": 0.5000000298023224,
"reward_std": 0.3759405389428139,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.6250000298023224,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 2595.666748046875,
"epoch": 0.132,
"grad_norm": 0.6036232709884644,
"kl": 0.024169921875,
"learning_rate": 6.860664508377001e-07,
"loss": -1.7953,
"reward": 0.3250000402331352,
"reward_std": 0.2370777204632759,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.416666679084301,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2626.5001220703125,
"epoch": 0.13257142857142856,
"grad_norm": 1.021782398223877,
"kl": 0.035400390625,
"learning_rate": 6.83068622519821e-07,
"loss": -1.0878,
"reward": 0.27500003203749657,
"reward_std": 0.20463287830352783,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.3333333544433117,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 3502.3333740234375,
"epoch": 0.13314285714285715,
"grad_norm": 0.8032714128494263,
"kl": 0.02203369140625,
"learning_rate": 6.800643086250121e-07,
"loss": -1.0137,
"reward": 0.05000000447034836,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0833333358168602,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 2883.375,
"epoch": 0.1337142857142857,
"grad_norm": 0.8170682787895203,
"kl": 0.02398681640625,
"learning_rate": 6.770536555792944e-07,
"loss": -1.9184,
"reward": 0.2500000149011612,
"reward_std": 0.22963719069957733,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.2916666716337204,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 1954.541748046875,
"epoch": 0.13428571428571429,
"grad_norm": 0.7082000970840454,
"kl": 0.01861572265625,
"learning_rate": 6.740368101176495e-07,
"loss": -1.5973,
"reward": 0.5500000417232513,
"reward_std": 0.1741531491279602,
"rewards/accuracy_reward": 0.3333333544433117,
"rewards/format_reward": 0.583333358168602,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1402.4166870117188,
"epoch": 0.13485714285714287,
"grad_norm": 0.6959501504898071,
"kl": 0.018798828125,
"learning_rate": 6.710139192768694e-07,
"loss": -0.742,
"reward": 0.7250000536441803,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.9583333432674408,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 1551.0834350585938,
"epoch": 0.13542857142857143,
"grad_norm": 1.2837129831314087,
"kl": 0.018768310546875,
"learning_rate": 6.679851303883891e-07,
"loss": -1.248,
"reward": 0.45000001788139343,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.75,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 2601.291748046875,
"epoch": 0.136,
"grad_norm": 0.6121569275856018,
"kl": 0.02252197265625,
"learning_rate": 6.649505910711058e-07,
"loss": -0.9744,
"reward": 0.32500000298023224,
"reward_std": 0.22555401921272278,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.375,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 2471.7500610351562,
"epoch": 0.13657142857142857,
"grad_norm": 0.7961202263832092,
"kl": 0.02349853515625,
"learning_rate": 6.619104492241847e-07,
"loss": -1.697,
"reward": 0.5,
"reward_std": 0.3340982347726822,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/format_reward": 0.5416666716337204,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 2069.291748046875,
"epoch": 0.13714285714285715,
"grad_norm": 26.108394622802734,
"kl": 0.1033935546875,
"learning_rate": 6.588648530198504e-07,
"loss": -1.4725,
"reward": 0.40000002086162567,
"reward_std": 0.17232800275087357,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.6250000149011612,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 2281.0833435058594,
"epoch": 0.1377142857142857,
"grad_norm": 0.651018500328064,
"kl": 0.02239990234375,
"learning_rate": 6.558139508961654e-07,
"loss": -0.6258,
"reward": 0.6000000759959221,
"reward_std": 0.19993416219949722,
"rewards/accuracy_reward": 0.4583333544433117,
"rewards/format_reward": 0.5416666679084301,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 2965.625,
"epoch": 0.1382857142857143,
"grad_norm": 0.9381749033927917,
"kl": 0.02398681640625,
"learning_rate": 6.527578915497951e-07,
"loss": -1.8083,
"reward": 0.15000001341104507,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.2500000074505806,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 2690.166748046875,
"epoch": 0.13885714285714285,
"grad_norm": 0.735163152217865,
"kl": 0.02117919921875,
"learning_rate": 6.496968239287603e-07,
"loss": -1.3621,
"reward": 0.2250000163912773,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.3750000223517418,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 2612.5416870117188,
"epoch": 0.13942857142857143,
"grad_norm": 0.8661508560180664,
"kl": 0.0235595703125,
"learning_rate": 6.466308972251785e-07,
"loss": -1.5249,
"reward": 0.30000001192092896,
"reward_std": 0.2323790192604065,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.4166666716337204,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 2336.8751220703125,
"epoch": 0.14,
"grad_norm": 0.9766954779624939,
"kl": 0.019775390625,
"learning_rate": 6.435602608679916e-07,
"loss": -1.3257,
"reward": 0.3500000238418579,
"reward_std": 0.2566385716199875,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.4166666716337204,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 2764.25,
"epoch": 0.14057142857142857,
"grad_norm": 0.5228642821311951,
"kl": 0.01849365234375,
"learning_rate": 6.404850645156841e-07,
"loss": -0.5755,
"reward": 0.2250000163912773,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.2916666679084301,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 3086.0001220703125,
"epoch": 0.14114285714285715,
"grad_norm": 0.7841210961341858,
"kl": 0.023162841796875,
"learning_rate": 6.374054580489873e-07,
"loss": -2.1551,
"reward": 0.45000001788139343,
"reward_std": 0.39986832439899445,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.4166666716337204,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 1750.7083740234375,
"epoch": 0.1417142857142857,
"grad_norm": 0.7208341360092163,
"kl": 0.017120361328125,
"learning_rate": 6.343215915635761e-07,
"loss": -0.7416,
"reward": 0.7250000536441803,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.7083333432674408,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 2371.8751220703125,
"epoch": 0.1422857142857143,
"grad_norm": 0.6523035168647766,
"kl": 0.018218994140625,
"learning_rate": 6.31233615362752e-07,
"loss": -0.7417,
"reward": 0.2750000059604645,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.4583333432674408,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 2791.416748046875,
"epoch": 0.14285714285714285,
"grad_norm": 0.7076215147972107,
"kl": 0.01898193359375,
"learning_rate": 6.281416799501187e-07,
"loss": -0.4655,
"reward": 0.32500001415610313,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.2916666679084301,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 1733.0000610351562,
"epoch": 0.14342857142857143,
"grad_norm": 0.5396320819854736,
"kl": 0.0194091796875,
"learning_rate": 6.25045936022246e-07,
"loss": -0.7209,
"reward": 0.5500000417232513,
"reward_std": 0.17232800275087357,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.7083333432674408,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 2229.7501220703125,
"epoch": 0.144,
"grad_norm": 1.1370456218719482,
"kl": 0.029296875,
"learning_rate": 6.219465344613258e-07,
"loss": -2.3579,
"reward": 0.30000002682209015,
"reward_std": 0.24978766590356827,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.458333358168602,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 2835.25,
"epoch": 0.14457142857142857,
"grad_norm": 0.5109323859214783,
"kl": 0.0166015625,
"learning_rate": 6.188436263278172e-07,
"loss": 0.0027,
"reward": 0.17500001192092896,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.25,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 2688.6250610351562,
"epoch": 0.14514285714285713,
"grad_norm": 0.5289558172225952,
"kl": 0.01513671875,
"learning_rate": 6.157373628530852e-07,
"loss": -1.8877,
"reward": 0.40000002086162567,
"reward_std": 0.36425092816352844,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.5,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 2283.25,
"epoch": 0.1457142857142857,
"grad_norm": 0.7051854729652405,
"kl": 0.01953125,
"learning_rate": 6.126278954320294e-07,
"loss": -1.7114,
"reward": 0.5,
"reward_std": 0.21162375062704086,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/format_reward": 0.5000000149011612,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 3230.7501220703125,
"epoch": 0.1462857142857143,
"grad_norm": 0.6201116442680359,
"kl": 0.0255126953125,
"learning_rate": 6.095153756157051e-07,
"loss": -1.4431,
"reward": 0.10000000894069672,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.1666666716337204,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 2076.9166870117188,
"epoch": 0.14685714285714285,
"grad_norm": 0.9742373824119568,
"kl": 0.02349853515625,
"learning_rate": 6.06399955103937e-07,
"loss": -1.6298,
"reward": 0.5000000298023224,
"reward_std": 0.27253394573926926,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.5833333432674408,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 1520.791748046875,
"epoch": 0.14742857142857144,
"grad_norm": 0.6934832334518433,
"kl": 0.011260986328125,
"learning_rate": 6.032817857379256e-07,
"loss": -0.7432,
"reward": 0.5250000357627869,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.7083333432674408,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 2571.0,
"epoch": 0.148,
"grad_norm": 0.5018502473831177,
"kl": 0.01641845703125,
"learning_rate": 6.001610194928464e-07,
"loss": -0.8253,
"reward": 0.27500003576278687,
"reward_std": 0.14747881889343262,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.375,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 3117.416748046875,
"epoch": 0.14857142857142858,
"grad_norm": 134.84226989746094,
"kl": 1.43408203125,
"learning_rate": 5.97037808470444e-07,
"loss": -1.1063,
"reward": 0.22500000894069672,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.25,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.14914285714285713,
"grad_norm": 0.3973945677280426,
"kl": 0.0257568359375,
"learning_rate": 5.939123048916173e-07,
"loss": -0.5713,
"reward": 0.02500000223517418,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0416666679084301,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 2326.916748046875,
"epoch": 0.14971428571428572,
"grad_norm": 0.5301626324653625,
"kl": 0.019775390625,
"learning_rate": 5.907846610890011e-07,
"loss": -0.7393,
"reward": 0.4500000476837158,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.5416666865348816,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 2577.666748046875,
"epoch": 0.15028571428571427,
"grad_norm": 0.6672086715698242,
"kl": 0.02447509765625,
"learning_rate": 5.87655029499542e-07,
"loss": -0.7189,
"reward": 0.32500000298023224,
"reward_std": 0.1596180573105812,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.375,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.15085714285714286,
"grad_norm": 0.40429234504699707,
"kl": 0.02337646484375,
"learning_rate": 5.845235626570683e-07,
"loss": 0.0037,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 3360.25,
"epoch": 0.15142857142857144,
"grad_norm": 0.6446682214736938,
"kl": 0.025390625,
"learning_rate": 5.813904131848564e-07,
"loss": -0.616,
"reward": 0.05000000447034836,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0833333358168602,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 2176.416748046875,
"epoch": 0.152,
"grad_norm": 0.7718228101730347,
"kl": 0.031494140625,
"learning_rate": 5.78255733788191e-07,
"loss": -0.4889,
"reward": 0.3750000149011612,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.5416666865348816,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 1419.5416870117188,
"epoch": 0.15257142857142858,
"grad_norm": 0.5885297656059265,
"kl": 0.018798828125,
"learning_rate": 5.751196772469237e-07,
"loss": 0.003,
"reward": 0.5500000417232513,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.75,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 2091.0000610351562,
"epoch": 0.15314285714285714,
"grad_norm": 0.7558290958404541,
"kl": 0.01812744140625,
"learning_rate": 5.71982396408026e-07,
"loss": -0.5488,
"reward": 0.45000001788139343,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.5416666865348816,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 3082.3333740234375,
"epoch": 0.15371428571428572,
"grad_norm": 0.9141106009483337,
"kl": 0.021484375,
"learning_rate": 5.688440441781398e-07,
"loss": -2.2151,
"reward": 0.15000001341104507,
"reward_std": 0.2323790118098259,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.2500000074505806,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 1514.1666870117188,
"epoch": 0.15428571428571428,
"grad_norm": 0.8448731899261475,
"kl": 0.01568603515625,
"learning_rate": 5.657047735161255e-07,
"loss": -0.7411,
"reward": 0.675000011920929,
"reward_std": 0.13869690895080566,
"rewards/accuracy_reward": 0.4166666716337204,
"rewards/format_reward": 0.7083333432674408,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 1341.916748046875,
"epoch": 0.15485714285714286,
"grad_norm": 1.3789052963256836,
"kl": 0.020538330078125,
"learning_rate": 5.625647374256061e-07,
"loss": -0.7409,
"reward": 0.6500000357627869,
"reward_std": 0.19993415474891663,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.9583333432674408,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 2583.7083740234375,
"epoch": 0.15542857142857142,
"grad_norm": 1.2276079654693604,
"kl": 0.0244140625,
"learning_rate": 5.594240889475106e-07,
"loss": -2.3455,
"reward": 0.40000002086162567,
"reward_std": 0.36425092816352844,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.5,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 2224.0,
"epoch": 0.156,
"grad_norm": 0.45050886273384094,
"kl": 0.022003173828125,
"learning_rate": 5.562829811526154e-07,
"loss": 0.0035,
"reward": 0.45000001788139343,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.5,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 2945.0833740234375,
"epoch": 0.15657142857142858,
"grad_norm": 0.7557133436203003,
"kl": 0.01849365234375,
"learning_rate": 5.531415671340826e-07,
"loss": 0.003,
"reward": 0.2250000238418579,
"reward_std": 0.08215838670730591,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.25,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 1650.8333740234375,
"epoch": 0.15714285714285714,
"grad_norm": 0.675412654876709,
"kl": 0.014434814453125,
"learning_rate": 5.5e-07,
"loss": -0.7416,
"reward": 0.5000000149011612,
"reward_std": 0.19993415474891663,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.7083333432674408,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 1545.3334350585938,
"epoch": 0.15771428571428572,
"grad_norm": 0.7588775157928467,
"kl": 0.018646240234375,
"learning_rate": 5.468584328659172e-07,
"loss": -1.4864,
"reward": 0.9750000238418579,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.7083333432674408,
"rewards/format_reward": 0.9166666865348816,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 2915.791748046875,
"epoch": 0.15828571428571428,
"grad_norm": 1.0028012990951538,
"kl": 0.0211181640625,
"learning_rate": 5.437170188473847e-07,
"loss": -1.5921,
"reward": 0.3499999940395355,
"reward_std": 0.26039472222328186,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.375,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 2256.25,
"epoch": 0.15885714285714286,
"grad_norm": 0.644159734249115,
"kl": 0.015411376953125,
"learning_rate": 5.405759110524894e-07,
"loss": 0.0025,
"reward": 0.40000003576278687,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.5,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 2127.9584350585938,
"epoch": 0.15942857142857142,
"grad_norm": 1.0713709592819214,
"kl": 0.0250244140625,
"learning_rate": 5.37435262574394e-07,
"loss": -2.5405,
"reward": 0.5500000268220901,
"reward_std": 0.2861757129430771,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.75,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 2087.2083740234375,
"epoch": 0.16,
"grad_norm": 0.44367942214012146,
"kl": 0.014678955078125,
"learning_rate": 5.342952264838747e-07,
"loss": -0.7211,
"reward": 0.5000000298023224,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.625,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1533.0417175292969,
"epoch": 0.16057142857142856,
"grad_norm": 1.1346826553344727,
"kl": 0.015472412109375,
"learning_rate": 5.311559558218603e-07,
"loss": -0.7422,
"reward": 0.5500000417232513,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.6666666865348816,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 2707.9166870117188,
"epoch": 0.16114285714285714,
"grad_norm": 0.7678596377372742,
"kl": 0.0263671875,
"learning_rate": 5.28017603591974e-07,
"loss": -0.7124,
"reward": 0.30000001192092896,
"reward_std": 0.20871606469154358,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.375,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 2104.8333740234375,
"epoch": 0.16171428571428573,
"grad_norm": 0.3849073052406311,
"kl": 0.01458740234375,
"learning_rate": 5.248803227530763e-07,
"loss": 0.0023,
"reward": 0.32500000298023224,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.5,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 2547.9166870117188,
"epoch": 0.16228571428571428,
"grad_norm": 1.1764817237854004,
"kl": 0.01708984375,
"learning_rate": 5.21744266211809e-07,
"loss": -2.3522,
"reward": 0.5750000178813934,
"reward_std": 0.3804733455181122,
"rewards/accuracy_reward": 0.4583333432674408,
"rewards/format_reward": 0.5000000298023224,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 2855.8333740234375,
"epoch": 0.16285714285714287,
"grad_norm": 0.8596624135971069,
"kl": 0.0228271484375,
"learning_rate": 5.186095868151436e-07,
"loss": -1.918,
"reward": 0.2500000111758709,
"reward_std": 0.2773938328027725,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3333333544433117,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 2617.541748046875,
"epoch": 0.16342857142857142,
"grad_norm": 1.1564656496047974,
"kl": 0.0211181640625,
"learning_rate": 5.154764373429315e-07,
"loss": -2.6721,
"reward": 0.6750000417232513,
"reward_std": 0.37711599469184875,
"rewards/accuracy_reward": 0.5000000149011612,
"rewards/format_reward": 0.625,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 1809.9584350585938,
"epoch": 0.164,
"grad_norm": 0.8912317156791687,
"kl": 0.017974853515625,
"learning_rate": 5.123449705004581e-07,
"loss": -1.9258,
"reward": 0.5750000476837158,
"reward_std": 0.3480285108089447,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.7083333432674408,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 2473.6666870117188,
"epoch": 0.16457142857142856,
"grad_norm": 0.6116588115692139,
"kl": 0.01898193359375,
"learning_rate": 5.09215338910999e-07,
"loss": -0.8855,
"reward": 0.375,
"reward_std": 0.13869690895080566,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.4166666865348816,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 1470.0416870117188,
"epoch": 0.16514285714285715,
"grad_norm": 0.5574433207511902,
"kl": 0.017059326171875,
"learning_rate": 5.060876951083828e-07,
"loss": -0.7664,
"reward": 0.6250000298023224,
"reward_std": 0.21615658700466156,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.9166666865348816,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 1707.041748046875,
"epoch": 0.1657142857142857,
"grad_norm": 1.1696451902389526,
"kl": 0.0301513671875,
"learning_rate": 5.02962191529556e-07,
"loss": -1.657,
"reward": 0.5250000208616257,
"reward_std": 0.23356524109840393,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.7916666865348816,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 2784.0001220703125,
"epoch": 0.1662857142857143,
"grad_norm": 1.8924946784973145,
"kl": 0.0260009765625,
"learning_rate": 4.998389805071536e-07,
"loss": -2.9789,
"reward": 0.32500001788139343,
"reward_std": 0.33025872707366943,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.4583333432674408,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 1551.4166870117188,
"epoch": 0.16685714285714287,
"grad_norm": 1.2713048458099365,
"kl": 0.01983642578125,
"learning_rate": 4.967182142620745e-07,
"loss": -2.4029,
"reward": 0.5250000208616257,
"reward_std": 0.27286098897457123,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.7916666865348816,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1370.5417175292969,
"epoch": 0.16742857142857143,
"grad_norm": 0.436497300863266,
"kl": 0.013275146484375,
"learning_rate": 4.93600044896063e-07,
"loss": 0.0021,
"reward": 0.6750000417232513,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.3750000223517418,
"rewards/format_reward": 0.75,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 2411.7500610351562,
"epoch": 0.168,
"grad_norm": 0.9047071933746338,
"kl": 0.021728515625,
"learning_rate": 4.904846243842949e-07,
"loss": -1.6188,
"reward": 0.30000003427267075,
"reward_std": 0.1549193412065506,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5000000223517418,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 2314.9583740234375,
"epoch": 0.16857142857142857,
"grad_norm": 0.8080445528030396,
"kl": 0.02862548828125,
"learning_rate": 4.873721045679706e-07,
"loss": -2.1152,
"reward": 0.30000003427267075,
"reward_std": 0.19993416219949722,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5000000223517418,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 2848.9583740234375,
"epoch": 0.16914285714285715,
"grad_norm": 0.5764908790588379,
"kl": 0.02508544921875,
"learning_rate": 4.842626371469149e-07,
"loss": -0.6015,
"reward": 0.17500000819563866,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.25,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 2570.166748046875,
"epoch": 0.1697142857142857,
"grad_norm": 1.0034595727920532,
"kl": 0.02734375,
"learning_rate": 4.811563736721829e-07,
"loss": -2.3512,
"reward": 0.45000000298023224,
"reward_std": 0.2658701241016388,
"rewards/accuracy_reward": 0.3333333544433117,
"rewards/format_reward": 0.4166666865348816,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 1806.4584350585938,
"epoch": 0.1702857142857143,
"grad_norm": 0.8656820058822632,
"kl": 0.0198974609375,
"learning_rate": 4.780534655386743e-07,
"loss": -1.6228,
"reward": 0.5250000506639481,
"reward_std": 0.29361625015735626,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.6666666716337204,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 1468.416748046875,
"epoch": 0.17085714285714285,
"grad_norm": 1.4623134136199951,
"kl": 0.025634765625,
"learning_rate": 4.749540639777539e-07,
"loss": -2.5606,
"reward": 0.4750000089406967,
"reward_std": 0.31102490425109863,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.7083333432674408,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 2191.3333740234375,
"epoch": 0.17142857142857143,
"grad_norm": 0.7832638621330261,
"kl": 0.02301025390625,
"learning_rate": 4.7185832004988133e-07,
"loss": -1.4816,
"reward": 0.32500001788139343,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5416666865348816,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 2002.916748046875,
"epoch": 0.172,
"grad_norm": 0.4890463948249817,
"kl": 0.013916015625,
"learning_rate": 1.5267358321348285e-07,
"loss": -0.6248,
"reward": 0.3500000238418579,
"reward_std": 0.0774596706032753,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5833333432674408,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 1066.3333435058594,
"epoch": 0.17257142857142857,
"grad_norm": 1.2706708908081055,
"kl": 0.01739501953125,
"learning_rate": 1.5058639494795067e-07,
"loss": -1.6018,
"reward": 0.7750000357627869,
"reward_std": 0.20295868068933487,
"rewards/accuracy_reward": 0.4166666716337204,
"rewards/format_reward": 0.8750000298023224,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 1463.7083435058594,
"epoch": 0.17314285714285715,
"grad_norm": 0.5108232498168945,
"kl": 0.01483154296875,
"learning_rate": 1.485389347912525e-07,
"loss": -0.8718,
"reward": 0.6500000357627869,
"reward_std": 0.22085529565811157,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.9166666865348816,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 2589.916748046875,
"epoch": 0.1737142857142857,
"grad_norm": 1.3276212215423584,
"kl": 0.013671875,
"learning_rate": 1.4653140639624066e-07,
"loss": -3.066,
"reward": 0.45000001788139343,
"reward_std": 0.43685072660446167,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.5833333432674408,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 2135.0,
"epoch": 0.1742857142857143,
"grad_norm": 1.30924654006958,
"kl": 0.012664794921875,
"learning_rate": 1.4456400944391144e-07,
"loss": -0.9444,
"reward": 0.3750000223517418,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.5833333358168602,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 1213.0833740234375,
"epoch": 0.17485714285714285,
"grad_norm": 1.3626219034194946,
"kl": 0.02374267578125,
"learning_rate": 1.4263693962354336e-07,
"loss": -1.6655,
"reward": 0.7500000596046448,
"reward_std": 0.3175487816333771,
"rewards/accuracy_reward": 0.4166666865348816,
"rewards/format_reward": 0.8333333432674408,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 2899.25,
"epoch": 0.17542857142857143,
"grad_norm": 0.6594395041465759,
"kl": 0.01434326171875,
"learning_rate": 1.4075038861323302e-07,
"loss": -0.8706,
"reward": 0.27500003576278687,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.3333333432674408,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 1401.2500915527344,
"epoch": 0.176,
"grad_norm": 0.5402439832687378,
"kl": 0.016876220703125,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0028,
"reward": 0.9000000357627869,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.75,
"rewards/format_reward": 0.75,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 2200.416748046875,
"epoch": 0.17657142857142857,
"grad_norm": 0.667547881603241,
"kl": 0.024871826171875,
"learning_rate": 1.3709958956526974e-07,
"loss": -1.5809,
"reward": 0.32500000298023224,
"reward_std": 0.1596180498600006,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.5416666865348816,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 2872.5833740234375,
"epoch": 0.17714285714285713,
"grad_norm": 0.6462530493736267,
"kl": 0.014068603515625,
"learning_rate": 1.353357046583165e-07,
"loss": -1.4915,
"reward": 0.30000001192092896,
"reward_std": 0.2258318066596985,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.3750000149011612,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 2144.3333740234375,
"epoch": 0.1777142857142857,
"grad_norm": 0.760235607624054,
"kl": 0.012969970703125,
"learning_rate": 1.3361306478670148e-07,
"loss": -2.1933,
"reward": 0.3500000089406967,
"reward_std": 0.23826396465301514,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.5416666865348816,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 1499.1250610351562,
"epoch": 0.1782857142857143,
"grad_norm": 1.0971490144729614,
"kl": 0.02130126953125,
"learning_rate": 1.3193184129467384e-07,
"loss": -0.6513,
"reward": 0.4750000238418579,
"reward_std": 0.06123724579811096,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.7916666865348816,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 1944.8751220703125,
"epoch": 0.17885714285714285,
"grad_norm": 0.6792640089988708,
"kl": 0.01654052734375,
"learning_rate": 1.3029220140695756e-07,
"loss": -0.4889,
"reward": 0.3750000298023224,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.5416666865348816,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 1524.5000610351562,
"epoch": 0.17942857142857144,
"grad_norm": 1.003913402557373,
"kl": 0.015625,
"learning_rate": 1.2869430821211826e-07,
"loss": -1.2384,
"reward": 0.7500000298023224,
"reward_std": 0.2173428237438202,
"rewards/accuracy_reward": 0.5416666865348816,
"rewards/format_reward": 0.7083333432674408,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 2062.4166870117188,
"epoch": 0.18,
"grad_norm": 0.8467549085617065,
"kl": 0.01544189453125,
"learning_rate": 1.2713832064634125e-07,
"loss": -1.1761,
"reward": 0.40000002086162567,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.6666666716337204,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 1394.25,
"epoch": 0.18057142857142858,
"grad_norm": 1.1105024814605713,
"kl": 0.021453857421875,
"learning_rate": 1.2562439347762275e-07,
"loss": -1.5211,
"reward": 0.5500000417232513,
"reward_std": 0.19993415474891663,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.875,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 1474.3333740234375,
"epoch": 0.18114285714285713,
"grad_norm": 0.979469358921051,
"kl": 0.011138916015625,
"learning_rate": 1.2415267729037608e-07,
"loss": -2.2617,
"reward": 0.6500000059604645,
"reward_std": 0.3548534959554672,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 0.7916666865348816,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 1628.416748046875,
"epoch": 0.18171428571428572,
"grad_norm": 0.563599705696106,
"kl": 0.01849365234375,
"learning_rate": 1.2272331847045313e-07,
"loss": 0.003,
"reward": 0.7500000894069672,
"reward_std": 0.19993416219949722,
"rewards/accuracy_reward": 0.5000000223517418,
"rewards/format_reward": 0.75,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 1559.5833435058594,
"epoch": 0.18228571428571427,
"grad_norm": 0.9234899282455444,
"kl": 0.01715087890625,
"learning_rate": 1.2133645919058418e-07,
"loss": -1.6808,
"reward": 0.6250000149011612,
"reward_std": 0.19037556648254395,
"rewards/accuracy_reward": 0.2916666679084301,
"rewards/format_reward": 0.75,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 1388.8333740234375,
"epoch": 0.18285714285714286,
"grad_norm": 1.0862469673156738,
"kl": 0.020263671875,
"learning_rate": 1.1999223739623666e-07,
"loss": -1.6945,
"reward": 0.6000000089406967,
"reward_std": 0.2658701241016388,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.8333333432674408,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 2178.0001220703125,
"epoch": 0.18342857142857144,
"grad_norm": 1.5233582258224487,
"kl": 0.022247314453125,
"learning_rate": 1.1869078679189393e-07,
"loss": -1.4616,
"reward": 0.32500001788139343,
"reward_std": 0.21615657955408096,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.458333358168602,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 2045.166748046875,
"epoch": 0.184,
"grad_norm": 1.028342843055725,
"kl": 0.01947021484375,
"learning_rate": 1.1743223682775649e-07,
"loss": -1.4788,
"reward": 0.40000002086162567,
"reward_std": 0.23826396465301514,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.5833333432674408,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 1766.9583740234375,
"epoch": 0.18457142857142858,
"grad_norm": 1.2755353450775146,
"kl": 0.0216064453125,
"learning_rate": 1.1621671268686605e-07,
"loss": -2.4408,
"reward": 0.5000000298023224,
"reward_std": 0.3290724903345108,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.7083333432674408,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 1399.8750610351562,
"epoch": 0.18514285714285714,
"grad_norm": 0.7948786020278931,
"kl": 0.01861572265625,
"learning_rate": 1.1504433527265378e-07,
"loss": -1.615,
"reward": 0.6000000089406967,
"reward_std": 0.31572362780570984,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.8333333432674408,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 1793.8334350585938,
"epoch": 0.18571428571428572,
"grad_norm": 1.1149201393127441,
"kl": 0.02008056640625,
"learning_rate": 1.1391522119691496e-07,
"loss": -2.113,
"reward": 0.4750000238418579,
"reward_std": 0.3386310636997223,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.6666666865348816,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 1259.4583740234375,
"epoch": 0.18628571428571428,
"grad_norm": 0.9218916296958923,
"kl": 0.024658203125,
"learning_rate": 1.1282948276820962e-07,
"loss": -0.7426,
"reward": 0.6000000536441803,
"reward_std": 0.22085530310869217,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.8333333432674408,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 1897.2083740234375,
"epoch": 0.18685714285714286,
"grad_norm": 1.3489266633987427,
"kl": 0.02215576171875,
"learning_rate": 1.1178722798069215e-07,
"loss": -2.2226,
"reward": 0.5000000149011612,
"reward_std": 0.2773938328027725,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.6666666865348816,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 1416.5833435058594,
"epoch": 0.18742857142857142,
"grad_norm": 0.8123278617858887,
"kl": 0.02020263671875,
"learning_rate": 1.10788560503369e-07,
"loss": 0.0032,
"reward": 0.5500000268220901,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.75,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 1962.4583740234375,
"epoch": 0.188,
"grad_norm": 0.5757811665534973,
"kl": 0.019775390625,
"learning_rate": 1.0983357966978745e-07,
"loss": -0.9902,
"reward": 0.42499999701976776,
"reward_std": 0.20463287830352783,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.625,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 1766.2500610351562,
"epoch": 0.18857142857142858,
"grad_norm": 0.9221534132957458,
"kl": 0.01800537109375,
"learning_rate": 1.0892238046815527e-07,
"loss": -1.6717,
"reward": 0.5500000268220901,
"reward_std": 0.2983149588108063,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.7083333432674408,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 1579.5833435058594,
"epoch": 0.18914285714285714,
"grad_norm": 0.9036383628845215,
"kl": 0.01678466796875,
"learning_rate": 1.0805505353189254e-07,
"loss": -2.0184,
"reward": 0.5750000476837158,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.7083333432674408,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 1057.0,
"epoch": 0.18971428571428572,
"grad_norm": 0.5648021697998047,
"kl": 0.010589599609375,
"learning_rate": 1.0723168513061665e-07,
"loss": -0.7254,
"reward": 0.7000000476837158,
"reward_std": 0.17232800275087357,
"rewards/accuracy_reward": 0.2083333395421505,
"rewards/format_reward": 0.9583333432674408,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 1274.291748046875,
"epoch": 0.19028571428571428,
"grad_norm": 0.8038429617881775,
"kl": 0.01422119140625,
"learning_rate": 1.0645235716156168e-07,
"loss": -1.4998,
"reward": 0.5500000417232513,
"reward_std": 0.17232800275087357,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.8750000298023224,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 1840.2084350585938,
"epoch": 0.19085714285714286,
"grad_norm": 0.7369369864463806,
"kl": 0.01922607421875,
"learning_rate": 1.0571714714143197e-07,
"loss": -0.7416,
"reward": 0.5750000476837158,
"reward_std": 0.18371173739433289,
"rewards/accuracy_reward": 0.2916666865348816,
"rewards/format_reward": 0.6666666865348816,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 1033.6666870117188,
"epoch": 0.19142857142857142,
"grad_norm": 0.9152414202690125,
"kl": 0.012664794921875,
"learning_rate": 1.0502612819869216e-07,
"loss": -0.759,
"reward": 0.5750000476837158,
"reward_std": 0.11291590332984924,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.9166666865348816,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 929.3333435058594,
"epoch": 0.192,
"grad_norm": 0.354445219039917,
"kl": 0.016204833984375,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0026,
"reward": 0.7750000357627869,
"reward_std": 0.1596180573105812,
"rewards/accuracy_reward": 0.2916666716337204,
"rewards/format_reward": 1.0,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 1763.0000915527344,
"epoch": 0.19257142857142856,
"grad_norm": 1.0156564712524414,
"kl": 0.015411376953125,
"learning_rate": 1.0377693407483638e-07,
"loss": -1.6813,
"reward": 0.6500000357627869,
"reward_std": 0.2658701241016388,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.8333333432674408,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 1598.7083435058594,
"epoch": 0.19314285714285714,
"grad_norm": 0.8538982272148132,
"kl": 0.021728515625,
"learning_rate": 1.032188831461732e-07,
"loss": -1.2843,
"reward": 0.7000000178813934,
"reward_std": 0.2658701241016388,
"rewards/accuracy_reward": 0.4166666865348816,
"rewards/format_reward": 0.7500000298023224,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 1587.2083740234375,
"epoch": 0.19371428571428573,
"grad_norm": 1.12893545627594,
"kl": 0.01947021484375,
"learning_rate": 1.0270527178744664e-07,
"loss": -1.5945,
"reward": 0.5000000447034836,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.8333333432674408,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 1353.5833740234375,
"epoch": 0.19428571428571428,
"grad_norm": 0.6671582460403442,
"kl": 0.01910400390625,
"learning_rate": 1.0223615108556937e-07,
"loss": -0.7477,
"reward": 0.6000000536441803,
"reward_std": 0.20871604979038239,
"rewards/accuracy_reward": 0.1250000037252903,
"rewards/format_reward": 0.875,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 1181.7500610351562,
"epoch": 0.19485714285714287,
"grad_norm": 0.725435733795166,
"kl": 0.01190185546875,
"learning_rate": 1.0181156770214242e-07,
"loss": -1.4692,
"reward": 0.5750000476837158,
"reward_std": 0.15610557794570923,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.9166666865348816,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 870.9166870117188,
"epoch": 0.19542857142857142,
"grad_norm": 1.4129509925842285,
"kl": 0.017486572265625,
"learning_rate": 1.0143156386881408e-07,
"loss": -0.742,
"reward": 0.7250000834465027,
"reward_std": 0.21615658700466156,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.9583333432674408,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 1194.5000610351562,
"epoch": 0.196,
"grad_norm": 1.0423340797424316,
"kl": 0.0213623046875,
"learning_rate": 1.0109617738307911e-07,
"loss": -1.4624,
"reward": 0.6500000357627869,
"reward_std": 0.24494898319244385,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.9166666865348816,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 1545.8750610351562,
"epoch": 0.19657142857142856,
"grad_norm": 0.3742052912712097,
"kl": 0.0191650390625,
"learning_rate": 1.0080544160451918e-07,
"loss": 0.0031,
"reward": 0.5500000417232513,
"reward_std": 0.14339563250541687,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.75,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 1445.9583435058594,
"epoch": 0.19714285714285715,
"grad_norm": 1.0773917436599731,
"kl": 0.019287109375,
"learning_rate": 1.0055938545148495e-07,
"loss": -0.5323,
"reward": 0.7500000596046448,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.5000000298023224,
"rewards/format_reward": 0.75,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 1790.5000610351562,
"epoch": 0.1977142857142857,
"grad_norm": 0.7913344502449036,
"kl": 0.0220947265625,
"learning_rate": 1.0035803339821934e-07,
"loss": -1.419,
"reward": 0.550000011920929,
"reward_std": 0.3209003359079361,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.7500000298023224,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 1382.2916870117188,
"epoch": 0.1982857142857143,
"grad_norm": 1.1861852407455444,
"kl": 0.015045166015625,
"learning_rate": 1.002014054724235e-07,
"loss": -1.5114,
"reward": 0.5750000178813934,
"reward_std": 0.21615657955408096,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.8750000298023224,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 1589.2500610351562,
"epoch": 0.19885714285714284,
"grad_norm": 0.8138798475265503,
"kl": 0.025634765625,
"learning_rate": 1.0008951725326441e-07,
"loss": -1.7143,
"reward": 0.4750000238418579,
"reward_std": 0.1596180573105812,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.7916666865348816,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 1109.5417175292969,
"epoch": 0.19942857142857143,
"grad_norm": 0.814863383769989,
"kl": 0.024444580078125,
"learning_rate": 1.0002237986982564e-07,
"loss": -0.7403,
"reward": 0.7750000357627869,
"reward_std": 0.13869691640138626,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.9583333432674408,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 1071.9166870117188,
"epoch": 0.2,
"grad_norm": 0.7820748090744019,
"kl": 0.01959228515625,
"learning_rate": 1e-07,
"loss": -1.4839,
"reward": 0.7000000476837158,
"reward_std": 0.12247449159622192,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.9166666865348816,
"step": 350
},
{
"epoch": 0.2,
"step": 350,
"total_flos": 0.0,
"train_loss": -0.17849066271579692,
"train_runtime": 2375.2815,
"train_samples_per_second": 3.536,
"train_steps_per_second": 0.147
}
],
"logging_steps": 1,
"max_steps": 350,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}