Files
OpenRS-GRPO-S-2/trainer_state.json
ModelHub XC ddfd33fab1 初始化项目,由ModelHub XC社区提供模型
Model: mimoidochi/OpenRS-GRPO-S-2
Source: Original Platform
2026-06-01 00:21:17 +08:00

4593 lines
135 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 500,
"global_step": 350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 3134.95849609375,
"epoch": 0.0005714285714285715,
"grad_norm": 0.7448968291282654,
"kl": 0.0,
"learning_rate": 2.857142857142857e-08,
"loss": -0.9387,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 2868.9583740234375,
"epoch": 0.001142857142857143,
"grad_norm": 0.39778050780296326,
"kl": 0.0,
"learning_rate": 5.714285714285714e-08,
"loss": -0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 2831.75,
"epoch": 0.0017142857142857142,
"grad_norm": 1.477035641670227,
"kl": 2.5451183319091797e-05,
"learning_rate": 8.571428571428572e-08,
"loss": -1.9087,
"reward": 0.2916666865348816,
"reward_std": 0.26603007316589355,
"rewards/accuracy_reward": 0.2916666865348816,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 2640.375,
"epoch": 0.002285714285714286,
"grad_norm": 1.3637254238128662,
"kl": 2.60770320892334e-05,
"learning_rate": 1.1428571428571427e-07,
"loss": -0.9903,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2990.416748046875,
"epoch": 0.002857142857142857,
"grad_norm": 0.9073159694671631,
"kl": 4.756450653076172e-05,
"learning_rate": 1.4285714285714285e-07,
"loss": -0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 2824.9583740234375,
"epoch": 0.0034285714285714284,
"grad_norm": 0.2965989410877228,
"kl": 3.439188003540039e-05,
"learning_rate": 1.7142857142857143e-07,
"loss": -0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 3328.125,
"epoch": 0.004,
"grad_norm": 0.6783003807067871,
"kl": 3.141164779663086e-05,
"learning_rate": 2e-07,
"loss": -0.735,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 2861.7501220703125,
"epoch": 0.004571428571428572,
"grad_norm": 0.6755802035331726,
"kl": 2.0116567611694336e-05,
"learning_rate": 2.2857142857142855e-07,
"loss": -0.733,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2332.33349609375,
"epoch": 0.005142857142857143,
"grad_norm": 1.6076233386993408,
"kl": 4.553794860839844e-05,
"learning_rate": 2.571428571428571e-07,
"loss": -1.4705,
"reward": 0.2500000111758709,
"reward_std": 0.20412415266036987,
"rewards/accuracy_reward": 0.2500000111758709,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 2922.7083740234375,
"epoch": 0.005714285714285714,
"grad_norm": 1.9102457761764526,
"kl": 2.950429916381836e-05,
"learning_rate": 2.857142857142857e-07,
"loss": -2.88,
"reward": 0.2916666716337204,
"reward_std": 0.395129531621933,
"rewards/accuracy_reward": 0.2916666716337204,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2510.08349609375,
"epoch": 0.006285714285714286,
"grad_norm": 2.036496639251709,
"kl": 5.0187110900878906e-05,
"learning_rate": 3.142857142857143e-07,
"loss": -1.6748,
"reward": 0.2083333395421505,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 2951.5001220703125,
"epoch": 0.006857142857142857,
"grad_norm": 3.159651279449463,
"kl": 3.3855438232421875e-05,
"learning_rate": 3.4285714285714286e-07,
"loss": -1.9348,
"reward": 0.2083333358168602,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.2083333358168602,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 2773.166748046875,
"epoch": 0.0074285714285714285,
"grad_norm": 0.7064540386199951,
"kl": 1.9043684005737305e-05,
"learning_rate": 3.7142857142857145e-07,
"loss": -1.4774,
"reward": 0.25,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.25,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 2341.0833740234375,
"epoch": 0.008,
"grad_norm": 0.7222815155982971,
"kl": 4.0650367736816406e-05,
"learning_rate": 4e-07,
"loss": -0.7442,
"reward": 0.4583333432674408,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.4583333432674408,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2472.791748046875,
"epoch": 0.008571428571428572,
"grad_norm": 0.6996123790740967,
"kl": 2.658367156982422e-05,
"learning_rate": 4.285714285714285e-07,
"loss": -0.7392,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 3173.7501220703125,
"epoch": 0.009142857142857144,
"grad_norm": 1.0215026140213013,
"kl": 4.792213439941406e-05,
"learning_rate": 4.571428571428571e-07,
"loss": -0.9988,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 3044.916748046875,
"epoch": 0.009714285714285713,
"grad_norm": 0.6144117116928101,
"kl": 4.780292510986328e-05,
"learning_rate": 4.857142857142857e-07,
"loss": -0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 3092.2083740234375,
"epoch": 0.010285714285714285,
"grad_norm": 1.0112086534500122,
"kl": 4.398822784423828e-05,
"learning_rate": 5.142857142857142e-07,
"loss": -1.7343,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 2726.7501220703125,
"epoch": 0.010857142857142857,
"grad_norm": 0.9500206112861633,
"kl": 5.0067901611328125e-05,
"learning_rate": 5.428571428571428e-07,
"loss": -0.9414,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 1488.6250915527344,
"epoch": 0.011428571428571429,
"grad_norm": 1.102255940437317,
"kl": 2.771615982055664e-05,
"learning_rate": 5.714285714285714e-07,
"loss": -1.685,
"reward": 0.375,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.375,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1883.8334350585938,
"epoch": 0.012,
"grad_norm": 2.169161558151245,
"kl": 4.553794860839844e-05,
"learning_rate": 6e-07,
"loss": -2.3845,
"reward": 0.1666666716337204,
"reward_std": 0.3332235962152481,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 2890.416748046875,
"epoch": 0.012571428571428572,
"grad_norm": 3.423088550567627,
"kl": 3.8743019104003906e-05,
"learning_rate": 6.285714285714286e-07,
"loss": -3.6207,
"reward": 0.5000000298023224,
"reward_std": 0.49719157814979553,
"rewards/accuracy_reward": 0.5000000298023224,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 2703.9583740234375,
"epoch": 0.013142857142857144,
"grad_norm": 1.8604422807693481,
"kl": 5.328655242919922e-05,
"learning_rate": 6.571428571428571e-07,
"loss": -1.8681,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 2797.5,
"epoch": 0.013714285714285714,
"grad_norm": 1.5852028131484985,
"kl": 6.42538070678711e-05,
"learning_rate": 6.857142857142857e-07,
"loss": -1.6552,
"reward": 0.291666679084301,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.291666679084301,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1578.5417175292969,
"epoch": 0.014285714285714285,
"grad_norm": 1.9514633417129517,
"kl": 5.4836273193359375e-05,
"learning_rate": 7.142857142857143e-07,
"loss": -2.4821,
"reward": 0.375,
"reward_std": 0.3410547971725464,
"rewards/accuracy_reward": 0.375,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 2654.666748046875,
"epoch": 0.014857142857142857,
"grad_norm": 1.7990684509277344,
"kl": 7.772445678710938e-05,
"learning_rate": 7.428571428571429e-07,
"loss": -1.8798,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 3421.9583740234375,
"epoch": 0.015428571428571429,
"grad_norm": 0.19747313857078552,
"kl": 9.441375732421875e-05,
"learning_rate": 7.714285714285714e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 2514.3751220703125,
"epoch": 0.016,
"grad_norm": 1.2808500528335571,
"kl": 8.916854858398438e-05,
"learning_rate": 8e-07,
"loss": -1.7419,
"reward": 0.1666666679084301,
"reward_std": 0.23899271339178085,
"rewards/accuracy_reward": 0.1666666679084301,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 2835.9583740234375,
"epoch": 0.01657142857142857,
"grad_norm": 2.1129915714263916,
"kl": 9.989738464355469e-05,
"learning_rate": 8.285714285714285e-07,
"loss": -1.9361,
"reward": 0.2083333432674408,
"reward_std": 0.26603007316589355,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 3564.416748046875,
"epoch": 0.017142857142857144,
"grad_norm": 1.2208162546157837,
"kl": 0.0001220703125,
"learning_rate": 8.57142857142857e-07,
"loss": -1.8741,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2305.2916870117188,
"epoch": 0.017714285714285714,
"grad_norm": 1.7113858461380005,
"kl": 0.00023293495178222656,
"learning_rate": 8.857142857142856e-07,
"loss": -1.6837,
"reward": 0.2083333395421505,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 2269.6666870117188,
"epoch": 0.018285714285714287,
"grad_norm": 2.3853516578674316,
"kl": 0.00031375885009765625,
"learning_rate": 9.142857142857142e-07,
"loss": -1.8832,
"reward": 0.3333333432674408,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.3333333432674408,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.018857142857142857,
"grad_norm": 0.5030116438865662,
"kl": 0.00035381317138671875,
"learning_rate": 9.428571428571428e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 2739.6250610351562,
"epoch": 0.019428571428571427,
"grad_norm": 1.6508129835128784,
"kl": 0.00036144256591796875,
"learning_rate": 9.714285714285715e-07,
"loss": -1.8729,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 3151.291748046875,
"epoch": 0.02,
"grad_norm": 0.6652354001998901,
"kl": 0.0004940032958984375,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 2798.291748046875,
"epoch": 0.02057142857142857,
"grad_norm": 1.781134843826294,
"kl": 0.0005702972412109375,
"learning_rate": 9.999776201301742e-07,
"loss": -1.6796,
"reward": 0.1250000037252903,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 3469.8751220703125,
"epoch": 0.021142857142857144,
"grad_norm": 1.4302971363067627,
"kl": 0.000423431396484375,
"learning_rate": 9.999104827467354e-07,
"loss": -1.7207,
"reward": 0.1666666679084301,
"reward_std": 0.23899271339178085,
"rewards/accuracy_reward": 0.1666666679084301,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 3070.75,
"epoch": 0.021714285714285714,
"grad_norm": 1.0685802698135376,
"kl": 0.0008945465087890625,
"learning_rate": 9.997985945275765e-07,
"loss": -1.7185,
"reward": 0.1666666679084301,
"reward_std": 0.23899271339178085,
"rewards/accuracy_reward": 0.1666666679084301,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1992.791748046875,
"epoch": 0.022285714285714287,
"grad_norm": 1.3023139238357544,
"kl": 0.00078582763671875,
"learning_rate": 9.996419666017806e-07,
"loss": -0.9405,
"reward": 0.4166666865348816,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666865348816,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 3451.2083740234375,
"epoch": 0.022857142857142857,
"grad_norm": 0.3098903298377991,
"kl": 0.000820159912109375,
"learning_rate": 9.994406145485149e-07,
"loss": 0.0001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 2855.041748046875,
"epoch": 0.023428571428571427,
"grad_norm": 0.9272823333740234,
"kl": 0.001934051513671875,
"learning_rate": 9.991945583954808e-07,
"loss": -0.7224,
"reward": 0.2916666865348816,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.2916666865348816,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 2162.3334350585938,
"epoch": 0.024,
"grad_norm": 1.3229894638061523,
"kl": 0.00215911865234375,
"learning_rate": 9.989038226169207e-07,
"loss": -1.6771,
"reward": 0.1250000037252903,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 3419.291748046875,
"epoch": 0.02457142857142857,
"grad_norm": 1.2003979682922363,
"kl": 0.00185394287109375,
"learning_rate": 9.985684361311858e-07,
"loss": -1.4837,
"reward": 0.0833333358168602,
"reward_std": 0.20412414520978928,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 2956.9583740234375,
"epoch": 0.025142857142857144,
"grad_norm": 1.1588594913482666,
"kl": 0.00295257568359375,
"learning_rate": 9.981884322978574e-07,
"loss": -1.8687,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 3275.166748046875,
"epoch": 0.025714285714285714,
"grad_norm": 0.5378715991973877,
"kl": 0.00176239013671875,
"learning_rate": 9.977638489144307e-07,
"loss": -0.731,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 2521.20849609375,
"epoch": 0.026285714285714287,
"grad_norm": 1.5500916242599487,
"kl": 0.00409698486328125,
"learning_rate": 9.972947282125533e-07,
"loss": -2.6114,
"reward": 0.2916666716337204,
"reward_std": 0.3602609634399414,
"rewards/accuracy_reward": 0.2916666716337204,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2619.5416870117188,
"epoch": 0.026857142857142857,
"grad_norm": 0.6222373247146606,
"kl": 0.00214385986328125,
"learning_rate": 9.967811168538266e-07,
"loss": -0.7438,
"reward": 0.4583333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.4583333432674408,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 1774.7500610351562,
"epoch": 0.027428571428571427,
"grad_norm": 2.3108158111572266,
"kl": 0.00504302978515625,
"learning_rate": 9.962230659251635e-07,
"loss": -2.8747,
"reward": 0.375,
"reward_std": 0.395129531621933,
"rewards/accuracy_reward": 0.375,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2151.1666870117188,
"epoch": 0.028,
"grad_norm": 1.405102252960205,
"kl": 0.00498199462890625,
"learning_rate": 9.956206309337066e-07,
"loss": -1.7383,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 2918.75,
"epoch": 0.02857142857142857,
"grad_norm": 0.5462047457695007,
"kl": 0.0054779052734375,
"learning_rate": 9.949738718013078e-07,
"loss": -0.7418,
"reward": 0.2916666865348816,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2916666865348816,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 2755.2084350585938,
"epoch": 0.029142857142857144,
"grad_norm": 1.4510318040847778,
"kl": 0.003520965576171875,
"learning_rate": 9.94282852858568e-07,
"loss": -1.4865,
"reward": 0.2500000111758709,
"reward_std": 0.20412414520978928,
"rewards/accuracy_reward": 0.2500000111758709,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 3465.5,
"epoch": 0.029714285714285714,
"grad_norm": 0.6955327987670898,
"kl": 0.003753662109375,
"learning_rate": 9.935476428384382e-07,
"loss": -0.9347,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 3004.291748046875,
"epoch": 0.030285714285714287,
"grad_norm": 1.4090065956115723,
"kl": 0.0052032470703125,
"learning_rate": 9.927683148693833e-07,
"loss": -1.6757,
"reward": 0.1250000037252903,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 3041.541748046875,
"epoch": 0.030857142857142857,
"grad_norm": 0.2869970500469208,
"kl": 0.0038604736328125,
"learning_rate": 9.919449464681074e-07,
"loss": 0.0006,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 3156.916748046875,
"epoch": 0.03142857142857143,
"grad_norm": 0.863756537437439,
"kl": 0.0051727294921875,
"learning_rate": 9.910776195318447e-07,
"loss": -1.6736,
"reward": 0.2083333432674408,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 2776.041748046875,
"epoch": 0.032,
"grad_norm": 0.8829509615898132,
"kl": 0.00750732421875,
"learning_rate": 9.901664203302124e-07,
"loss": -0.7267,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 2872.666748046875,
"epoch": 0.03257142857142857,
"grad_norm": 1.085909366607666,
"kl": 0.0074005126953125,
"learning_rate": 9.89211439496631e-07,
"loss": -2.6113,
"reward": 0.2083333395421505,
"reward_std": 0.3602609634399414,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 3217.875,
"epoch": 0.03314285714285714,
"grad_norm": 0.8099249005317688,
"kl": 0.0081024169921875,
"learning_rate": 9.882127720193078e-07,
"loss": -1.7368,
"reward": 0.1666666679084301,
"reward_std": 0.23899271339178085,
"rewards/accuracy_reward": 0.1666666679084301,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 2047.6250610351562,
"epoch": 0.03371428571428572,
"grad_norm": 0.9201177358627319,
"kl": 0.01129150390625,
"learning_rate": 9.871705172317903e-07,
"loss": -1.4821,
"reward": 0.0833333358168602,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 3198.166748046875,
"epoch": 0.03428571428571429,
"grad_norm": 0.7448273301124573,
"kl": 0.009307861328125,
"learning_rate": 9.86084778803085e-07,
"loss": -1.7306,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 2209.9583740234375,
"epoch": 0.03485714285714286,
"grad_norm": 2.11501145362854,
"kl": 0.006805419921875,
"learning_rate": 9.849556647273461e-07,
"loss": -3.6049,
"reward": 0.4166666716337204,
"reward_std": 0.4971916079521179,
"rewards/accuracy_reward": 0.4166666716337204,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 2689.3333740234375,
"epoch": 0.03542857142857143,
"grad_norm": 0.3600199520587921,
"kl": 0.00649261474609375,
"learning_rate": 9.83783287313134e-07,
"loss": 0.001,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 3274.6251220703125,
"epoch": 0.036,
"grad_norm": 0.7515490055084229,
"kl": 0.009490966796875,
"learning_rate": 9.825677631722435e-07,
"loss": -1.4768,
"reward": 0.0833333358168602,
"reward_std": 0.20412415266036987,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 3410.041748046875,
"epoch": 0.036571428571428574,
"grad_norm": 0.2419460266828537,
"kl": 0.006988525390625,
"learning_rate": 9.81309213208106e-07,
"loss": 0.0011,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 3360.58349609375,
"epoch": 0.037142857142857144,
"grad_norm": 1.0176174640655518,
"kl": 0.0145263671875,
"learning_rate": 9.800077626037633e-07,
"loss": -0.9358,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1987.7084350585938,
"epoch": 0.037714285714285714,
"grad_norm": 1.1186784505844116,
"kl": 0.008026123046875,
"learning_rate": 9.786635408094157e-07,
"loss": -1.7376,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 1821.4166870117188,
"epoch": 0.038285714285714284,
"grad_norm": 2.2777140140533447,
"kl": 0.013885498046875,
"learning_rate": 9.772766815295467e-07,
"loss": -2.2118,
"reward": 0.1250000037252903,
"reward_std": 0.3061862140893936,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 3485.416748046875,
"epoch": 0.038857142857142854,
"grad_norm": 1.044973373413086,
"kl": 0.008209228515625,
"learning_rate": 9.758473227096238e-07,
"loss": -2.2271,
"reward": 0.1250000037252903,
"reward_std": 0.306186206638813,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 2774.791748046875,
"epoch": 0.03942857142857143,
"grad_norm": 0.6572657823562622,
"kl": 0.00588226318359375,
"learning_rate": 9.743756065223773e-07,
"loss": -0.9946,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 2589.7083740234375,
"epoch": 0.04,
"grad_norm": 1.0476188659667969,
"kl": 0.011474609375,
"learning_rate": 9.728616793536587e-07,
"loss": -0.9965,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1670.9166870117188,
"epoch": 0.04057142857142857,
"grad_norm": 0.6323314309120178,
"kl": 0.0037994384765625,
"learning_rate": 9.713056917878816e-07,
"loss": 0.0008,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 2806.416748046875,
"epoch": 0.04114285714285714,
"grad_norm": 0.5533509850502014,
"kl": 0.01007080078125,
"learning_rate": 9.697077985930424e-07,
"loss": -0.7427,
"reward": 0.2083333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 1994.25,
"epoch": 0.04171428571428572,
"grad_norm": 1.0187442302703857,
"kl": 0.0106658935546875,
"learning_rate": 9.68068158705326e-07,
"loss": -0.997,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 2693.125,
"epoch": 0.04228571428571429,
"grad_norm": 0.834760844707489,
"kl": 0.00494384765625,
"learning_rate": 9.663869352132985e-07,
"loss": -1.6737,
"reward": 0.2083333395421505,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 2046.6666870117188,
"epoch": 0.04285714285714286,
"grad_norm": 0.8847272992134094,
"kl": 0.01031494140625,
"learning_rate": 9.646642953416834e-07,
"loss": -0.7426,
"reward": 0.2083333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 3069.7083740234375,
"epoch": 0.04342857142857143,
"grad_norm": 0.8105300664901733,
"kl": 0.0060882568359375,
"learning_rate": 9.6290041043473e-07,
"loss": -1.67,
"reward": 0.1250000037252903,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 2794.1251220703125,
"epoch": 0.044,
"grad_norm": 0.8459436893463135,
"kl": 0.011474609375,
"learning_rate": 9.610954559391704e-07,
"loss": -0.9395,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 2131.7083740234375,
"epoch": 0.044571428571428574,
"grad_norm": 0.9749023914337158,
"kl": 0.00823974609375,
"learning_rate": 9.592496113867668e-07,
"loss": -1.9386,
"reward": 0.5416666716337204,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.5416666716337204,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 2000.5833740234375,
"epoch": 0.045142857142857144,
"grad_norm": 0.6731491684913635,
"kl": 0.0103607177734375,
"learning_rate": 9.573630603764566e-07,
"loss": -0.7392,
"reward": 0.2916666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 3062.0833740234375,
"epoch": 0.045714285714285714,
"grad_norm": 0.9580934643745422,
"kl": 0.0061187744140625,
"learning_rate": 9.554359905560885e-07,
"loss": -0.988,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 1195.0000610351562,
"epoch": 0.046285714285714284,
"grad_norm": 1.316493034362793,
"kl": 0.011627197265625,
"learning_rate": 9.534685936037593e-07,
"loss": -1.9331,
"reward": 0.2916666716337204,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.2916666716337204,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 2899.75,
"epoch": 0.046857142857142854,
"grad_norm": 0.8076637387275696,
"kl": 0.01312255859375,
"learning_rate": 9.514610652087475e-07,
"loss": -1.6822,
"reward": 0.2083333395421505,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 869.2917175292969,
"epoch": 0.04742857142857143,
"grad_norm": 1.114304542541504,
"kl": 0.0088043212890625,
"learning_rate": 9.494136050520494e-07,
"loss": -0.7427,
"reward": 0.4583333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.4583333432674408,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 1685.3334350585938,
"epoch": 0.048,
"grad_norm": 1.8916584253311157,
"kl": 0.0077972412109375,
"learning_rate": 9.473264167865171e-07,
"loss": -2.8129,
"reward": 0.2500000074505806,
"reward_std": 0.3872983306646347,
"rewards/accuracy_reward": 0.2500000074505806,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1882.3750610351562,
"epoch": 0.04857142857142857,
"grad_norm": 0.999433696269989,
"kl": 0.023345947265625,
"learning_rate": 9.451997080166028e-07,
"loss": -0.7365,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 2046.541748046875,
"epoch": 0.04914285714285714,
"grad_norm": 0.9735854268074036,
"kl": 0.0104827880859375,
"learning_rate": 9.430336902777083e-07,
"loss": -2.4278,
"reward": 0.5000000223517418,
"reward_std": 0.3332235813140869,
"rewards/accuracy_reward": 0.5000000223517418,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 2605.2501220703125,
"epoch": 0.04971428571428571,
"grad_norm": 1.1373125314712524,
"kl": 0.019866943359375,
"learning_rate": 9.40828579015145e-07,
"loss": -1.481,
"reward": 0.2500000111758709,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.2500000111758709,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 2085.6251220703125,
"epoch": 0.05028571428571429,
"grad_norm": 1.1215746402740479,
"kl": 0.01055908203125,
"learning_rate": 9.385845935627039e-07,
"loss": -1.6782,
"reward": 0.1250000037252903,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1928.4166870117188,
"epoch": 0.05085714285714286,
"grad_norm": 1.2779592275619507,
"kl": 0.022705078125,
"learning_rate": 9.363019571208397e-07,
"loss": -1.722,
"reward": 0.1666666679084301,
"reward_std": 0.23899271339178085,
"rewards/accuracy_reward": 0.1666666679084301,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 1391.0833740234375,
"epoch": 0.05142857142857143,
"grad_norm": 1.28791081905365,
"kl": 0.011474609375,
"learning_rate": 9.3398089673447e-07,
"loss": -2.3681,
"reward": 0.4166666865348816,
"reward_std": 0.3332235962152481,
"rewards/accuracy_reward": 0.4166666865348816,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 2919.33349609375,
"epoch": 0.052,
"grad_norm": 0.5967366695404053,
"kl": 0.0152587890625,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0027,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 2806.8333740234375,
"epoch": 0.052571428571428575,
"grad_norm": 0.5494978427886963,
"kl": 0.01953125,
"learning_rate": 9.292244313943176e-07,
"loss": 0.0038,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 1886.0833435058594,
"epoch": 0.053142857142857144,
"grad_norm": 0.9120233654975891,
"kl": 0.014862060546875,
"learning_rate": 9.267894995475355e-07,
"loss": -0.7398,
"reward": 0.2916666865348816,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.2916666865348816,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 2132.8333740234375,
"epoch": 0.053714285714285714,
"grad_norm": 0.9982142448425293,
"kl": 0.01995849609375,
"learning_rate": 9.24317089923191e-07,
"loss": -1.682,
"reward": 0.291666679084301,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.291666679084301,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 1412.0000305175781,
"epoch": 0.054285714285714284,
"grad_norm": 1.1398898363113403,
"kl": 0.009185791015625,
"learning_rate": 9.218074484421977e-07,
"loss": -2.6066,
"reward": 0.2083333358168602,
"reward_std": 0.3602609485387802,
"rewards/accuracy_reward": 0.2083333358168602,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 3135.33349609375,
"epoch": 0.054857142857142854,
"grad_norm": 0.46777933835983276,
"kl": 0.013153076171875,
"learning_rate": 9.192608247287761e-07,
"loss": -0.7412,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 2038.7083435058594,
"epoch": 0.05542857142857143,
"grad_norm": 1.239867925643921,
"kl": 0.014312744140625,
"learning_rate": 9.166774720856253e-07,
"loss": -0.938,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 1562.3333740234375,
"epoch": 0.056,
"grad_norm": 1.027034044265747,
"kl": 0.0085601806640625,
"learning_rate": 9.140576474687263e-07,
"loss": -1.9377,
"reward": 0.5416666716337204,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.5416666716337204,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 2020.0001220703125,
"epoch": 0.05657142857142857,
"grad_norm": 1.366735577583313,
"kl": 0.0066680908203125,
"learning_rate": 9.114016114617857e-07,
"loss": -1.8751,
"reward": 0.5000000149011612,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.5000000149011612,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 1672.7500610351562,
"epoch": 0.05714285714285714,
"grad_norm": 1.17022705078125,
"kl": 0.00823974609375,
"learning_rate": 9.08709628250315e-07,
"loss": -2.4106,
"reward": 0.4166666865348816,
"reward_std": 0.3332235738635063,
"rewards/accuracy_reward": 0.4166666865348816,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 2906.58349609375,
"epoch": 0.05771428571428571,
"grad_norm": 0.629345715045929,
"kl": 0.009490966796875,
"learning_rate": 9.059819655953535e-07,
"loss": -1.6843,
"reward": 0.291666679084301,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.291666679084301,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 1907.1666870117188,
"epoch": 0.05828571428571429,
"grad_norm": 0.8361666202545166,
"kl": 0.01873779296875,
"learning_rate": 9.03218894806835e-07,
"loss": -1.4741,
"reward": 0.2500000111758709,
"reward_std": 0.20412414520978928,
"rewards/accuracy_reward": 0.2500000111758709,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 2738.041748046875,
"epoch": 0.05885714285714286,
"grad_norm": 0.5930432081222534,
"kl": 0.015869140625,
"learning_rate": 9.004206907166023e-07,
"loss": -0.9389,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 3003.791748046875,
"epoch": 0.05942857142857143,
"grad_norm": 0.7205336093902588,
"kl": 0.01336669921875,
"learning_rate": 8.975876316510698e-07,
"loss": -1.8796,
"reward": 0.2500000074505806,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.2500000074505806,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2454.041748046875,
"epoch": 0.06,
"grad_norm": 0.5209367275238037,
"kl": 0.01300048828125,
"learning_rate": 8.9471999940354e-07,
"loss": -0.9357,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 2740.2500610351562,
"epoch": 0.060571428571428575,
"grad_norm": 1.4679958820343018,
"kl": 0.011962890625,
"learning_rate": 8.918180792061751e-07,
"loss": -1.6703,
"reward": 0.2083333432674408,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 2093.916748046875,
"epoch": 0.061142857142857145,
"grad_norm": 0.57196444272995,
"kl": 0.0166015625,
"learning_rate": 8.88882159701625e-07,
"loss": -0.9368,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 2134.0833740234375,
"epoch": 0.061714285714285715,
"grad_norm": 0.393916517496109,
"kl": 0.010894775390625,
"learning_rate": 8.859125329143175e-07,
"loss": 0.0017,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 2911.541748046875,
"epoch": 0.062285714285714285,
"grad_norm": 0.3331953287124634,
"kl": 0.013275146484375,
"learning_rate": 8.829094942214127e-07,
"loss": -0.738,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 2564.5000610351562,
"epoch": 0.06285714285714286,
"grad_norm": 0.29394540190696716,
"kl": 0.014984130859375,
"learning_rate": 8.798733423234219e-07,
"loss": 0.0024,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 2804.0,
"epoch": 0.06342857142857143,
"grad_norm": 0.7025728225708008,
"kl": 0.017059326171875,
"learning_rate": 8.768043792144968e-07,
"loss": 0.0027,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 2215.0416870117188,
"epoch": 0.064,
"grad_norm": 0.9984528422355652,
"kl": 0.01275634765625,
"learning_rate": 8.737029101523929e-07,
"loss": -1.7326,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 2483.2083740234375,
"epoch": 0.06457142857142857,
"grad_norm": 0.7270219326019287,
"kl": 0.010986328125,
"learning_rate": 8.705692436281051e-07,
"loss": -1.8786,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 3233.20849609375,
"epoch": 0.06514285714285714,
"grad_norm": 0.5366232395172119,
"kl": 0.016143798828125,
"learning_rate": 8.674036913351838e-07,
"loss": -0.9371,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 2328.7501220703125,
"epoch": 0.06571428571428571,
"grad_norm": 0.6814904808998108,
"kl": 0.012298583984375,
"learning_rate": 8.642065681387327e-07,
"loss": -0.9397,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 2270.916748046875,
"epoch": 0.06628571428571428,
"grad_norm": 0.7077198624610901,
"kl": 0.0084075927734375,
"learning_rate": 8.609781920440891e-07,
"loss": -1.4859,
"reward": 0.0833333358168602,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 2786.1251220703125,
"epoch": 0.06685714285714285,
"grad_norm": 0.7300746440887451,
"kl": 0.0172119140625,
"learning_rate": 8.57718884165194e-07,
"loss": -1.6803,
"reward": 0.375,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.375,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 1789.4583740234375,
"epoch": 0.06742857142857143,
"grad_norm": 0.5591477751731873,
"kl": 0.01336669921875,
"learning_rate": 8.544289686926524e-07,
"loss": -0.9944,
"reward": 0.375,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.375,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2085.916748046875,
"epoch": 0.068,
"grad_norm": 0.9207446575164795,
"kl": 0.01312255859375,
"learning_rate": 8.511087728614862e-07,
"loss": -0.7421,
"reward": 0.4583333432674408,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.4583333432674408,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 2560.9583740234375,
"epoch": 0.06857142857142857,
"grad_norm": 0.5242665410041809,
"kl": 0.012298583984375,
"learning_rate": 8.477586269185867e-07,
"loss": 0.0019,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 2857.3333740234375,
"epoch": 0.06914285714285714,
"grad_norm": 0.8910159468650818,
"kl": 0.015045166015625,
"learning_rate": 8.443788640898654e-07,
"loss": -0.9389,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 1371.8750610351562,
"epoch": 0.06971428571428571,
"grad_norm": 1.075481653213501,
"kl": 0.010223388671875,
"learning_rate": 8.409698205471098e-07,
"loss": -1.9938,
"reward": 0.5,
"reward_std": 0.273861289024353,
"rewards/accuracy_reward": 0.5,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 2519.2500610351562,
"epoch": 0.07028571428571428,
"grad_norm": 1.0758017301559448,
"kl": 0.01434326171875,
"learning_rate": 8.37531835374545e-07,
"loss": -2.2018,
"reward": 0.291666679084301,
"reward_std": 0.3061862215399742,
"rewards/accuracy_reward": 0.291666679084301,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 2478.70849609375,
"epoch": 0.07085714285714285,
"grad_norm": 0.8420854210853577,
"kl": 0.017333984375,
"learning_rate": 8.340652505351075e-07,
"loss": -1.6825,
"reward": 0.3750000149011612,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.3750000149011612,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 1974.3333740234375,
"epoch": 0.07142857142857142,
"grad_norm": 0.8553494811058044,
"kl": 0.016143798828125,
"learning_rate": 8.305704108364301e-07,
"loss": -1.9371,
"reward": 0.2083333358168602,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.2083333358168602,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1352.0000305175781,
"epoch": 0.072,
"grad_norm": 0.7503184080123901,
"kl": 0.0130615234375,
"learning_rate": 8.270476638965461e-07,
"loss": -0.7278,
"reward": 0.2916666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2711.20849609375,
"epoch": 0.07257142857142856,
"grad_norm": 0.6893806457519531,
"kl": 0.0093841552734375,
"learning_rate": 8.234973601093135e-07,
"loss": -1.7343,
"reward": 0.4166666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.4166666716337204,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 2195.4583740234375,
"epoch": 0.07314285714285715,
"grad_norm": 0.5396758317947388,
"kl": 0.013031005859375,
"learning_rate": 8.199198526095611e-07,
"loss": -0.9382,
"reward": 0.3333333432674408,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.3333333432674408,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 1736.2916870117188,
"epoch": 0.07371428571428572,
"grad_norm": 1.1960101127624512,
"kl": 0.026214599609375,
"learning_rate": 8.163154972379655e-07,
"loss": -1.7324,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 1648.625,
"epoch": 0.07428571428571429,
"grad_norm": 0.7403463125228882,
"kl": 0.009918212890625,
"learning_rate": 8.126846525056555e-07,
"loss": -1.4736,
"reward": 0.3333333544433117,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.3333333544433117,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 2801.8333740234375,
"epoch": 0.07485714285714286,
"grad_norm": 0.7958399057388306,
"kl": 0.0125732421875,
"learning_rate": 8.090276795585531e-07,
"loss": -0.7389,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 2148.2084350585938,
"epoch": 0.07542857142857143,
"grad_norm": 0.6350630521774292,
"kl": 0.0111846923828125,
"learning_rate": 8.053449421414518e-07,
"loss": -1.6738,
"reward": 0.1250000037252903,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 2469.25,
"epoch": 0.076,
"grad_norm": 0.8013907670974731,
"kl": 0.014984130859375,
"learning_rate": 8.01636806561836e-07,
"loss": -1.4861,
"reward": 0.4166666865348816,
"reward_std": 0.20412415266036987,
"rewards/accuracy_reward": 0.4166666865348816,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 3052.5,
"epoch": 0.07657142857142857,
"grad_norm": 0.9900679588317871,
"kl": 0.020294189453125,
"learning_rate": 7.979036416534461e-07,
"loss": -1.8765,
"reward": 0.2500000074505806,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.2500000074505806,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 2824.541748046875,
"epoch": 0.07714285714285714,
"grad_norm": 0.5742546319961548,
"kl": 0.01898193359375,
"learning_rate": 7.941458187395917e-07,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1178.8750305175781,
"epoch": 0.07771428571428571,
"grad_norm": 1.0653953552246094,
"kl": 0.01934814453125,
"learning_rate": 7.903637115962179e-07,
"loss": -0.9936,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 2287.8333740234375,
"epoch": 0.07828571428571429,
"grad_norm": 0.7256926894187927,
"kl": 0.01116943359375,
"learning_rate": 7.86557696414727e-07,
"loss": -1.7341,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 2149.666748046875,
"epoch": 0.07885714285714286,
"grad_norm": 0.48477721214294434,
"kl": 0.0179443359375,
"learning_rate": 7.827281517645606e-07,
"loss": 0.003,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 2431.5833740234375,
"epoch": 0.07942857142857143,
"grad_norm": 0.7151182293891907,
"kl": 0.01629638671875,
"learning_rate": 7.788754585555441e-07,
"loss": -0.7297,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 1989.9166870117188,
"epoch": 0.08,
"grad_norm": 0.5723420977592468,
"kl": 0.0123291015625,
"learning_rate": 7.75e-07,
"loss": -0.9392,
"reward": 0.4166666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666716337204,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1466.0834350585938,
"epoch": 0.08057142857142857,
"grad_norm": 0.8384633660316467,
"kl": 0.012237548828125,
"learning_rate": 7.7110216157463e-07,
"loss": -0.9396,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 1722.1251220703125,
"epoch": 0.08114285714285714,
"grad_norm": 1.3027905225753784,
"kl": 0.0177001953125,
"learning_rate": 7.671823309821749e-07,
"loss": -0.9354,
"reward": 0.3333333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.3333333358168602,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 2374.5834350585938,
"epoch": 0.08171428571428571,
"grad_norm": 0.8892642259597778,
"kl": 0.02044677734375,
"learning_rate": 7.632408981128493e-07,
"loss": -1.924,
"reward": 0.4583333432674408,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.4583333432674408,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 1527.0833740234375,
"epoch": 0.08228571428571428,
"grad_norm": 1.809125542640686,
"kl": 0.0185546875,
"learning_rate": 7.592782550055628e-07,
"loss": -2.7263,
"reward": 0.2916666716337204,
"reward_std": 0.37592336535453796,
"rewards/accuracy_reward": 0.2916666716337204,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 886.1666870117188,
"epoch": 0.08285714285714285,
"grad_norm": 1.7466280460357666,
"kl": 0.01373291015625,
"learning_rate": 7.552947958089233e-07,
"loss": -2.7183,
"reward": 0.2916666679084301,
"reward_std": 0.37592335790395737,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1453.666748046875,
"epoch": 0.08342857142857144,
"grad_norm": 1.4468218088150024,
"kl": 0.013519287109375,
"learning_rate": 7.512909167420346e-07,
"loss": -2.4829,
"reward": 0.375,
"reward_std": 0.3410547822713852,
"rewards/accuracy_reward": 0.375,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 1603.791748046875,
"epoch": 0.084,
"grad_norm": 1.1846656799316406,
"kl": 0.01263427734375,
"learning_rate": 7.472670160550848e-07,
"loss": -2.6207,
"reward": 0.3750000149011612,
"reward_std": 0.3602609485387802,
"rewards/accuracy_reward": 0.3750000149011612,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 1206.4583740234375,
"epoch": 0.08457142857142858,
"grad_norm": 0.9129040837287903,
"kl": 0.01019287109375,
"learning_rate": 7.432234939897342e-07,
"loss": -1.6655,
"reward": 0.1250000037252903,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 1288.4583740234375,
"epoch": 0.08514285714285715,
"grad_norm": 0.93757164478302,
"kl": 0.01715087890625,
"learning_rate": 7.391607527393044e-07,
"loss": -0.9368,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 1977.041748046875,
"epoch": 0.08571428571428572,
"grad_norm": 1.4448423385620117,
"kl": 0.017486572265625,
"learning_rate": 7.350791964087752e-07,
"loss": -1.6729,
"reward": 0.2083333395421505,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 1958.7917175292969,
"epoch": 0.08628571428571429,
"grad_norm": 2.432656764984131,
"kl": 0.032012939453125,
"learning_rate": 7.309792309745878e-07,
"loss": -0.7388,
"reward": 0.4583333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.4583333432674408,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 2387.1251220703125,
"epoch": 0.08685714285714285,
"grad_norm": 0.5419829487800598,
"kl": 0.016754150390625,
"learning_rate": 7.268612642442656e-07,
"loss": 0.0024,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 2126.3333740234375,
"epoch": 0.08742857142857142,
"grad_norm": 0.5498960018157959,
"kl": 0.01190185546875,
"learning_rate": 7.227257058158502e-07,
"loss": -0.9965,
"reward": 0.375,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.375,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 2022.7501220703125,
"epoch": 0.088,
"grad_norm": 0.857801616191864,
"kl": 0.011138916015625,
"learning_rate": 7.185729670371604e-07,
"loss": -1.6669,
"reward": 0.4583333432674408,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.4583333432674408,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 3400.95849609375,
"epoch": 0.08857142857142856,
"grad_norm": 0.3610871732234955,
"kl": 0.0118408203125,
"learning_rate": 7.144034609648778e-07,
"loss": -0.7363,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1649.166748046875,
"epoch": 0.08914285714285715,
"grad_norm": 1.4170763492584229,
"kl": 0.012237548828125,
"learning_rate": 7.102176023234605e-07,
"loss": -2.4233,
"reward": 0.4166666716337204,
"reward_std": 0.3332235962152481,
"rewards/accuracy_reward": 0.4166666716337204,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 2151.5001220703125,
"epoch": 0.08971428571428572,
"grad_norm": 0.8594135046005249,
"kl": 0.01202392578125,
"learning_rate": 7.060158074638932e-07,
"loss": -0.7423,
"reward": 0.2083333432674408,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 1745.2083740234375,
"epoch": 0.09028571428571429,
"grad_norm": 0.9574673771858215,
"kl": 0.01275634765625,
"learning_rate": 7.017984943222735e-07,
"loss": -2.6742,
"reward": 0.25,
"reward_std": 0.3680921494960785,
"rewards/accuracy_reward": 0.25,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 3289.6251220703125,
"epoch": 0.09085714285714286,
"grad_norm": 0.5987799763679504,
"kl": 0.013763427734375,
"learning_rate": 6.97566082378242e-07,
"loss": -1.4697,
"reward": 0.0833333358168602,
"reward_std": 0.20412414520978928,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 3266.5833740234375,
"epoch": 0.09142857142857143,
"grad_norm": 0.5092036724090576,
"kl": 0.012542724609375,
"learning_rate": 6.93318992613258e-07,
"loss": -0.9381,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1581.166748046875,
"epoch": 0.092,
"grad_norm": 0.6672082543373108,
"kl": 0.01666259765625,
"learning_rate": 6.890576474687263e-07,
"loss": -0.7416,
"reward": 0.2083333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 2034.916748046875,
"epoch": 0.09257142857142857,
"grad_norm": 1.0506476163864136,
"kl": 0.0162353515625,
"learning_rate": 6.847824708039786e-07,
"loss": -1.9362,
"reward": 0.2916666716337204,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.2916666716337204,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 2280.166748046875,
"epoch": 0.09314285714285714,
"grad_norm": 0.7479243278503418,
"kl": 0.019195556640625,
"learning_rate": 6.804938878541138e-07,
"loss": -0.9382,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 2798.8751220703125,
"epoch": 0.09371428571428571,
"grad_norm": 0.7925019264221191,
"kl": 0.016387939453125,
"learning_rate": 6.761923251877012e-07,
"loss": -1.9367,
"reward": 0.2083333358168602,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.2083333358168602,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 3475.291748046875,
"epoch": 0.09428571428571429,
"grad_norm": 0.44603925943374634,
"kl": 0.01324462890625,
"learning_rate": 6.718782106643523e-07,
"loss": -0.7406,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 3584.0,
"epoch": 0.09485714285714286,
"grad_norm": 0.43778079748153687,
"kl": 0.014923095703125,
"learning_rate": 6.675519733921623e-07,
"loss": 0.0024,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 1407.8333740234375,
"epoch": 0.09542857142857143,
"grad_norm": 1.5409038066864014,
"kl": 0.01873779296875,
"learning_rate": 6.632140436850289e-07,
"loss": -2.9278,
"reward": 0.5833333432674408,
"reward_std": 0.40296071767807007,
"rewards/accuracy_reward": 0.5833333432674408,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 2116.8751220703125,
"epoch": 0.096,
"grad_norm": 1.3273327350616455,
"kl": 0.0230712890625,
"learning_rate": 6.588648530198504e-07,
"loss": -2.4069,
"reward": 0.2500000111758709,
"reward_std": 0.3332235813140869,
"rewards/accuracy_reward": 0.2500000111758709,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 1039.9166870117188,
"epoch": 0.09657142857142857,
"grad_norm": 1.1365933418273926,
"kl": 0.015655517578125,
"learning_rate": 6.545048339936091e-07,
"loss": -2.4803,
"reward": 0.375,
"reward_std": 0.3410547971725464,
"rewards/accuracy_reward": 0.375,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 2980.3333740234375,
"epoch": 0.09714285714285714,
"grad_norm": 0.33093100786209106,
"kl": 0.01751708984375,
"learning_rate": 6.501344202803414e-07,
"loss": -0.7351,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 2490.08349609375,
"epoch": 0.09771428571428571,
"grad_norm": 0.4821685254573822,
"kl": 0.011688232421875,
"learning_rate": 6.45754046588003e-07,
"loss": -0.9394,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 2099.2083740234375,
"epoch": 0.09828571428571428,
"grad_norm": 0.5119174718856812,
"kl": 0.01446533203125,
"learning_rate": 6.413641486152292e-07,
"loss": -0.7392,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 1332.7917175292969,
"epoch": 0.09885714285714285,
"grad_norm": 1.1657651662826538,
"kl": 0.017425537109375,
"learning_rate": 6.36965163007999e-07,
"loss": -2.2233,
"reward": 0.1250000037252903,
"reward_std": 0.3061862215399742,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 2518.7083740234375,
"epoch": 0.09942857142857142,
"grad_norm": 0.38400009274482727,
"kl": 0.02191162109375,
"learning_rate": 6.32557527316202e-07,
"loss": 0.0035,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 1589.9166870117188,
"epoch": 0.1,
"grad_norm": 1.2936434745788574,
"kl": 0.027374267578125,
"learning_rate": 6.281416799501187e-07,
"loss": -1.7382,
"reward": 0.5833333432674408,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.5833333432674408,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1653.9167175292969,
"epoch": 0.10057142857142858,
"grad_norm": 1.3641895055770874,
"kl": 0.02020263671875,
"learning_rate": 6.23718060136812e-07,
"loss": -2.6149,
"reward": 0.291666679084301,
"reward_std": 0.3602609634399414,
"rewards/accuracy_reward": 0.291666679084301,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 2173.9583435058594,
"epoch": 0.10114285714285715,
"grad_norm": 0.7888267636299133,
"kl": 0.016754150390625,
"learning_rate": 6.1928710787644e-07,
"loss": -0.9389,
"reward": 0.4166666865348816,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666865348816,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 3312.25,
"epoch": 0.10171428571428572,
"grad_norm": 0.9001780152320862,
"kl": 0.01397705078125,
"learning_rate": 6.14849263898491e-07,
"loss": -1.876,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 2765.416748046875,
"epoch": 0.10228571428571429,
"grad_norm": 0.5644757747650146,
"kl": 0.015350341796875,
"learning_rate": 6.10404969617945e-07,
"loss": 0.0024,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 2159.08349609375,
"epoch": 0.10285714285714286,
"grad_norm": 1.2660763263702393,
"kl": 0.0147705078125,
"learning_rate": 6.059546670913684e-07,
"loss": -2.7191,
"reward": 0.2916666716337204,
"reward_std": 0.37592336535453796,
"rewards/accuracy_reward": 0.2916666716337204,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 2710.625,
"epoch": 0.10342857142857143,
"grad_norm": 0.7003759145736694,
"kl": 0.02130126953125,
"learning_rate": 6.014987989729444e-07,
"loss": -1.8783,
"reward": 0.2500000074505806,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.2500000074505806,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 2344.625,
"epoch": 0.104,
"grad_norm": 0.6448298096656799,
"kl": 0.0184326171875,
"learning_rate": 5.97037808470444e-07,
"loss": -1.4745,
"reward": 0.0833333358168602,
"reward_std": 0.20412415266036987,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 2700.1251220703125,
"epoch": 0.10457142857142857,
"grad_norm": 0.40715157985687256,
"kl": 0.01531982421875,
"learning_rate": 5.925721393011417e-07,
"loss": -0.7211,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 1704.2501220703125,
"epoch": 0.10514285714285715,
"grad_norm": 1.0119861364364624,
"kl": 0.02203369140625,
"learning_rate": 5.881022356476804e-07,
"loss": -0.7407,
"reward": 0.4583333432674408,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.4583333432674408,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 1675.4166870117188,
"epoch": 0.10571428571428572,
"grad_norm": 2.0721378326416016,
"kl": 0.025970458984375,
"learning_rate": 5.836285421138909e-07,
"loss": -1.4489,
"reward": 0.0833333358168602,
"reward_std": 0.20412414520978928,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 2593.291748046875,
"epoch": 0.10628571428571429,
"grad_norm": 0.6480510234832764,
"kl": 0.0172119140625,
"learning_rate": 5.791515036805684e-07,
"loss": 0.003,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 2512.4583740234375,
"epoch": 0.10685714285714286,
"grad_norm": 0.7385961413383484,
"kl": 0.017333984375,
"learning_rate": 5.74671565661212e-07,
"loss": 0.0028,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 2527.8333740234375,
"epoch": 0.10742857142857143,
"grad_norm": 1.0105268955230713,
"kl": 0.018035888671875,
"learning_rate": 5.701891736577317e-07,
"loss": -2.7295,
"reward": 0.4583333432674408,
"reward_std": 0.37592336535453796,
"rewards/accuracy_reward": 0.4583333432674408,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 3513.25,
"epoch": 0.108,
"grad_norm": 0.6996456384658813,
"kl": 0.01776123046875,
"learning_rate": 5.657047735161255e-07,
"loss": -1.6768,
"reward": 0.2083333432674408,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 2915.916748046875,
"epoch": 0.10857142857142857,
"grad_norm": 0.6052663922309875,
"kl": 0.012054443359375,
"learning_rate": 5.612188112821328e-07,
"loss": -0.7361,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 2475.5001220703125,
"epoch": 0.10914285714285714,
"grad_norm": 0.3194851279258728,
"kl": 0.0223388671875,
"learning_rate": 5.567317331568686e-07,
"loss": 0.0036,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 2002.1250610351562,
"epoch": 0.10971428571428571,
"grad_norm": 45.269439697265625,
"kl": 0.251220703125,
"learning_rate": 5.522439854524411e-07,
"loss": -2.4264,
"reward": 0.3750000149011612,
"reward_std": 0.3410547897219658,
"rewards/accuracy_reward": 0.3750000149011612,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 2904.2083740234375,
"epoch": 0.11028571428571429,
"grad_norm": 0.3201312720775604,
"kl": 0.01861572265625,
"learning_rate": 5.477560145475589e-07,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 1847.7500610351562,
"epoch": 0.11085714285714286,
"grad_norm": 0.9246636629104614,
"kl": 0.0289306640625,
"learning_rate": 5.432682668431314e-07,
"loss": -0.917,
"reward": 0.3333333432674408,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.3333333432674408,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 2270.6666870117188,
"epoch": 0.11142857142857143,
"grad_norm": 0.4476500153541565,
"kl": 0.017730712890625,
"learning_rate": 5.387811887178673e-07,
"loss": 0.0029,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 2440.8333740234375,
"epoch": 0.112,
"grad_norm": 0.5211483836174011,
"kl": 0.011688232421875,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0022,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 2394.9583740234375,
"epoch": 0.11257142857142857,
"grad_norm": 0.533424973487854,
"kl": 0.02685546875,
"learning_rate": 5.298108263422685e-07,
"loss": 0.0048,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 2006.25,
"epoch": 0.11314285714285714,
"grad_norm": 1.4151604175567627,
"kl": 0.019134521484375,
"learning_rate": 5.25328434338788e-07,
"loss": -2.6747,
"reward": 0.3333333432674408,
"reward_std": 0.3680921494960785,
"rewards/accuracy_reward": 0.3333333432674408,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 2482.041748046875,
"epoch": 0.11371428571428571,
"grad_norm": 1.1144918203353882,
"kl": 0.019287109375,
"learning_rate": 5.208484963194316e-07,
"loss": -1.4696,
"reward": 0.2500000111758709,
"reward_std": 0.20412414520978928,
"rewards/accuracy_reward": 0.2500000111758709,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 2661.916748046875,
"epoch": 0.11428571428571428,
"grad_norm": 0.9477536082267761,
"kl": 0.023193359375,
"learning_rate": 5.163714578861091e-07,
"loss": -0.9854,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 2519.8333740234375,
"epoch": 0.11485714285714285,
"grad_norm": 1.073439359664917,
"kl": 0.016082763671875,
"learning_rate": 5.118977643523196e-07,
"loss": -0.7384,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 2459.916748046875,
"epoch": 0.11542857142857142,
"grad_norm": 1.0262060165405273,
"kl": 0.014678955078125,
"learning_rate": 5.074278606988584e-07,
"loss": -1.6796,
"reward": 0.2083333395421505,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 3012.3751220703125,
"epoch": 0.116,
"grad_norm": 0.8095118403434753,
"kl": 0.01739501953125,
"learning_rate": 5.02962191529556e-07,
"loss": -2.6648,
"reward": 0.25,
"reward_std": 0.3680921643972397,
"rewards/accuracy_reward": 0.25,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 2506.3751220703125,
"epoch": 0.11657142857142858,
"grad_norm": 0.5008082985877991,
"kl": 0.013458251953125,
"learning_rate": 4.985012010270557e-07,
"loss": 0.0021,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 2347.7916870117188,
"epoch": 0.11714285714285715,
"grad_norm": 0.6901289820671082,
"kl": 0.01611328125,
"learning_rate": 4.940453329086318e-07,
"loss": -1.4828,
"reward": 0.0833333358168602,
"reward_std": 0.20412415266036987,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 2616.666748046875,
"epoch": 0.11771428571428572,
"grad_norm": 0.631709098815918,
"kl": 0.02020263671875,
"learning_rate": 4.895950303820552e-07,
"loss": -0.9945,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 2875.8333740234375,
"epoch": 0.11828571428571429,
"grad_norm": 0.7697901129722595,
"kl": 0.014129638671875,
"learning_rate": 4.85150736101509e-07,
"loss": -0.9966,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 2224.3751220703125,
"epoch": 0.11885714285714286,
"grad_norm": 0.7268201112747192,
"kl": 0.011077880859375,
"learning_rate": 4.807128921235598e-07,
"loss": -1.4794,
"reward": 0.5,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.5,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 2420.7083740234375,
"epoch": 0.11942857142857143,
"grad_norm": 0.5704336166381836,
"kl": 0.009765625,
"learning_rate": 4.76281939863188e-07,
"loss": 0.0016,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 1993.3333740234375,
"epoch": 0.12,
"grad_norm": 0.507115364074707,
"kl": 0.017822265625,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0029,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 1624.125,
"epoch": 0.12057142857142857,
"grad_norm": 1.2795109748840332,
"kl": 0.009735107421875,
"learning_rate": 4.67442472683798e-07,
"loss": -2.6766,
"reward": 0.3333333432674408,
"reward_std": 0.3680921494960785,
"rewards/accuracy_reward": 0.3333333432674408,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 3554.3333740234375,
"epoch": 0.12114285714285715,
"grad_norm": 0.6074011325836182,
"kl": 0.021728515625,
"learning_rate": 4.6303483699200105e-07,
"loss": -0.9938,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 1680.666748046875,
"epoch": 0.12171428571428572,
"grad_norm": 1.1823277473449707,
"kl": 0.014251708984375,
"learning_rate": 4.5863585138477077e-07,
"loss": -1.7337,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 2227.375,
"epoch": 0.12228571428571429,
"grad_norm": 0.7161904573440552,
"kl": 0.01544189453125,
"learning_rate": 4.542459534119971e-07,
"loss": -0.7411,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 2544.7501220703125,
"epoch": 0.12285714285714286,
"grad_norm": 0.9380499720573425,
"kl": 0.01983642578125,
"learning_rate": 4.4986557971965856e-07,
"loss": -0.9942,
"reward": 0.125,
"reward_std": 0.1369306445121765,
"rewards/accuracy_reward": 0.125,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 1188.8333740234375,
"epoch": 0.12342857142857143,
"grad_norm": 1.2638156414031982,
"kl": 0.015777587890625,
"learning_rate": 4.454951660063909e-07,
"loss": -1.6826,
"reward": 0.7916666865348816,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.7916666865348816,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 3525.25,
"epoch": 0.124,
"grad_norm": 0.28554829955101013,
"kl": 0.01068115234375,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0017,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 2003.75,
"epoch": 0.12457142857142857,
"grad_norm": 0.7368922233581543,
"kl": 0.02069091796875,
"learning_rate": 4.367859563149712e-07,
"loss": -0.741,
"reward": 0.2083333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 884.8333435058594,
"epoch": 0.12514285714285714,
"grad_norm": 0.8822182416915894,
"kl": 0.01611328125,
"learning_rate": 4.3244802660783775e-07,
"loss": -0.9384,
"reward": 0.4166666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666716337204,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 2650.75,
"epoch": 0.12571428571428572,
"grad_norm": 0.6846665143966675,
"kl": 0.014801025390625,
"learning_rate": 4.281217893356478e-07,
"loss": -1.6798,
"reward": 0.3750000111758709,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.3750000111758709,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 1293.7916870117188,
"epoch": 0.12628571428571428,
"grad_norm": 0.869556188583374,
"kl": 0.016845703125,
"learning_rate": 4.2380767481229884e-07,
"loss": -0.7405,
"reward": 0.2916666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 1300.9583740234375,
"epoch": 0.12685714285714286,
"grad_norm": 0.937000572681427,
"kl": 0.01593017578125,
"learning_rate": 4.195061121458862e-07,
"loss": -0.939,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 2665.791748046875,
"epoch": 0.12742857142857142,
"grad_norm": 0.794379472732544,
"kl": 0.01953125,
"learning_rate": 4.152175291960214e-07,
"loss": -0.9379,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 2896.08349609375,
"epoch": 0.128,
"grad_norm": 0.8766622543334961,
"kl": 0.0145263671875,
"learning_rate": 4.1094235253127374e-07,
"loss": -2.6164,
"reward": 0.2916666716337204,
"reward_std": 0.3602609485387802,
"rewards/accuracy_reward": 0.2916666716337204,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 2229.8750610351562,
"epoch": 0.12857142857142856,
"grad_norm": 1.1731126308441162,
"kl": 0.016143798828125,
"learning_rate": 4.0668100738674205e-07,
"loss": -2.4688,
"reward": 0.2083333395421505,
"reward_std": 0.3410547971725464,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 1741.8750915527344,
"epoch": 0.12914285714285714,
"grad_norm": 0.7152168154716492,
"kl": 0.01202392578125,
"learning_rate": 4.0243391762175803e-07,
"loss": -0.9392,
"reward": 0.4166666865348816,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666865348816,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 2109.291748046875,
"epoch": 0.12971428571428573,
"grad_norm": 0.4053463339805603,
"kl": 0.0142822265625,
"learning_rate": 3.982015056777265e-07,
"loss": -0.7372,
"reward": 0.2916666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 981.5833740234375,
"epoch": 0.13028571428571428,
"grad_norm": 1.3437594175338745,
"kl": 0.017852783203125,
"learning_rate": 3.939841925361067e-07,
"loss": -1.7393,
"reward": 0.4166666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.4166666716337204,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 2041.2916870117188,
"epoch": 0.13085714285714287,
"grad_norm": 0.44564294815063477,
"kl": 0.01416015625,
"learning_rate": 3.897823976765394e-07,
"loss": -0.7404,
"reward": 0.2916666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 2914.416748046875,
"epoch": 0.13142857142857142,
"grad_norm": 5.425497531890869,
"kl": 0.02838134765625,
"learning_rate": 3.855965390351222e-07,
"loss": -1.8737,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 2574.375,
"epoch": 0.132,
"grad_norm": 1.1059547662734985,
"kl": 0.02020263671875,
"learning_rate": 3.8142703296283953e-07,
"loss": -1.6797,
"reward": 0.2083333432674408,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 2597.041748046875,
"epoch": 0.13257142857142856,
"grad_norm": 0.80033940076828,
"kl": 0.01983642578125,
"learning_rate": 3.772742941841499e-07,
"loss": -1.4814,
"reward": 0.2500000111758709,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.2500000111758709,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 3533.5,
"epoch": 0.13314285714285715,
"grad_norm": 0.4203540086746216,
"kl": 0.015869140625,
"learning_rate": 3.731387357557344e-07,
"loss": 0.0025,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 3225.6251220703125,
"epoch": 0.1337142857142857,
"grad_norm": 0.7065865397453308,
"kl": 0.017547607421875,
"learning_rate": 3.6902076902541214e-07,
"loss": -1.6826,
"reward": 0.2083333432674408,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 1722.6250610351562,
"epoch": 0.13428571428571429,
"grad_norm": 0.5265027284622192,
"kl": 0.013519287109375,
"learning_rate": 3.649208035912249e-07,
"loss": -0.7374,
"reward": 0.2916666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 1470.5000610351562,
"epoch": 0.13485714285714287,
"grad_norm": 0.5969668030738831,
"kl": 0.01806640625,
"learning_rate": 3.608392472606956e-07,
"loss": 0.0026,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 1409.5000610351562,
"epoch": 0.13542857142857143,
"grad_norm": 1.4041366577148438,
"kl": 0.0216064453125,
"learning_rate": 3.5677650601026585e-07,
"loss": -1.4828,
"reward": 0.0833333358168602,
"reward_std": 0.20412415266036987,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 2312.5001220703125,
"epoch": 0.136,
"grad_norm": 1.0515042543411255,
"kl": 0.0201416015625,
"learning_rate": 3.5273298394491515e-07,
"loss": -2.6223,
"reward": 0.291666679084301,
"reward_std": 0.3602609485387802,
"rewards/accuracy_reward": 0.291666679084301,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 2486.9166870117188,
"epoch": 0.13657142857142857,
"grad_norm": 0.8044987916946411,
"kl": 0.02252197265625,
"learning_rate": 3.4870908325796527e-07,
"loss": -2.2171,
"reward": 0.3750000223517418,
"reward_std": 0.306186206638813,
"rewards/accuracy_reward": 0.3750000223517418,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 1590.8751220703125,
"epoch": 0.13714285714285715,
"grad_norm": 0.6538607478141785,
"kl": 0.015533447265625,
"learning_rate": 3.4470520419107664e-07,
"loss": -0.9312,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 2145.8750610351562,
"epoch": 0.1377142857142857,
"grad_norm": 0.8519302010536194,
"kl": 0.02294921875,
"learning_rate": 3.407217449944373e-07,
"loss": -0.9377,
"reward": 0.4166666865348816,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666865348816,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 3469.291748046875,
"epoch": 0.1382857142857143,
"grad_norm": 0.5576246380805969,
"kl": 0.016998291015625,
"learning_rate": 3.367591018871506e-07,
"loss": -0.9348,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 2839.541748046875,
"epoch": 0.13885714285714285,
"grad_norm": 0.41596850752830505,
"kl": 0.01690673828125,
"learning_rate": 3.3281766901782517e-07,
"loss": 0.0027,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 2468.5416870117188,
"epoch": 0.13942857142857143,
"grad_norm": 0.8380830883979797,
"kl": 0.01788330078125,
"learning_rate": 3.2889783842536987e-07,
"loss": -0.9381,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 2339.041748046875,
"epoch": 0.14,
"grad_norm": 1.0777580738067627,
"kl": 0.01507568359375,
"learning_rate": 3.250000000000001e-07,
"loss": -2.5829,
"reward": 0.2083333358168602,
"reward_std": 0.3602609634399414,
"rewards/accuracy_reward": 0.2083333358168602,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 2758.166748046875,
"epoch": 0.14057142857142857,
"grad_norm": 0.8583000302314758,
"kl": 0.015411376953125,
"learning_rate": 3.211245414444559e-07,
"loss": -1.9945,
"reward": 0.25,
"reward_std": 0.273861289024353,
"rewards/accuracy_reward": 0.25,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 2797.291748046875,
"epoch": 0.14114285714285715,
"grad_norm": 1.0498836040496826,
"kl": 0.027252197265625,
"learning_rate": 3.172718482354393e-07,
"loss": -2.6177,
"reward": 0.458333358168602,
"reward_std": 0.3602609485387802,
"rewards/accuracy_reward": 0.458333358168602,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 1560.8750610351562,
"epoch": 0.1417142857142857,
"grad_norm": 0.2750161290168762,
"kl": 0.01739501953125,
"learning_rate": 3.1344230358527284e-07,
"loss": 0.0025,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 2254.375,
"epoch": 0.1422857142857143,
"grad_norm": 0.7589177489280701,
"kl": 0.02679443359375,
"learning_rate": 3.096362884037821e-07,
"loss": -0.7333,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 2698.6251220703125,
"epoch": 0.14285714285714285,
"grad_norm": 0.5587661266326904,
"kl": 0.015472412109375,
"learning_rate": 3.058541812604083e-07,
"loss": -0.7418,
"reward": 0.2083333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 1584.3750610351562,
"epoch": 0.14342857142857143,
"grad_norm": 0.8878784775733948,
"kl": 0.0150146484375,
"learning_rate": 3.020963583465539e-07,
"loss": -1.6822,
"reward": 0.3750000149011612,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.3750000149011612,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 1354.0833435058594,
"epoch": 0.144,
"grad_norm": 1.3565187454223633,
"kl": 0.0235595703125,
"learning_rate": 2.9836319343816397e-07,
"loss": -1.6732,
"reward": 0.125,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.125,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 2829.625,
"epoch": 0.14457142857142857,
"grad_norm": 0.8856632113456726,
"kl": 0.015411376953125,
"learning_rate": 2.946550578585483e-07,
"loss": -0.7394,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 2557.0,
"epoch": 0.14514285714285713,
"grad_norm": 0.5032978653907776,
"kl": 0.01812744140625,
"learning_rate": 2.9097232044144696e-07,
"loss": -1.724,
"reward": 0.1666666679084301,
"reward_std": 0.23899271339178085,
"rewards/accuracy_reward": 0.1666666679084301,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 2025.5001220703125,
"epoch": 0.1457142857142857,
"grad_norm": 0.6513389945030212,
"kl": 0.014007568359375,
"learning_rate": 2.8731534749434464e-07,
"loss": -0.9391,
"reward": 0.4166666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666716337204,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 3475.541748046875,
"epoch": 0.1462857142857143,
"grad_norm": 0.5068712830543518,
"kl": 0.01947021484375,
"learning_rate": 2.836845027620346e-07,
"loss": 0.0031,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 1864.4584350585938,
"epoch": 0.14685714285714285,
"grad_norm": 11.749297142028809,
"kl": 0.2183837890625,
"learning_rate": 2.8008014739043884e-07,
"loss": -1.8304,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 1771.0833740234375,
"epoch": 0.14742857142857144,
"grad_norm": 0.6075973510742188,
"kl": 0.009613037109375,
"learning_rate": 2.765026398906865e-07,
"loss": -1.6771,
"reward": 0.2083333395421505,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 2659.1666870117188,
"epoch": 0.148,
"grad_norm": 0.9356407523155212,
"kl": 0.014862060546875,
"learning_rate": 2.729523361034538e-07,
"loss": -1.6647,
"reward": 0.125,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.125,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 2985.7083740234375,
"epoch": 0.14857142857142858,
"grad_norm": 0.3902454674243927,
"kl": 0.015960693359375,
"learning_rate": 2.6942958916356994e-07,
"loss": -0.9388,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 3379.5833740234375,
"epoch": 0.14914285714285713,
"grad_norm": 0.5188402533531189,
"kl": 0.02239990234375,
"learning_rate": 2.659347494648925e-07,
"loss": -0.7375,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 2401.70849609375,
"epoch": 0.14971428571428572,
"grad_norm": 0.5534473657608032,
"kl": 0.014617919921875,
"learning_rate": 2.6246816462545496e-07,
"loss": -0.9393,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 2452.0833740234375,
"epoch": 0.15028571428571427,
"grad_norm": 0.7840031981468201,
"kl": 0.01806640625,
"learning_rate": 2.5903017945289017e-07,
"loss": -0.9387,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 3485.625,
"epoch": 0.15085714285714286,
"grad_norm": 0.45268696546554565,
"kl": 0.013824462890625,
"learning_rate": 2.5562113591013457e-07,
"loss": 0.0022,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 3424.5001220703125,
"epoch": 0.15142857142857144,
"grad_norm": 0.4016251564025879,
"kl": 0.01983642578125,
"learning_rate": 2.5224137308141336e-07,
"loss": -0.7367,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 2272.375,
"epoch": 0.152,
"grad_norm": 0.6088007092475891,
"kl": 0.0252685546875,
"learning_rate": 2.488912271385139e-07,
"loss": 0.004,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 1409.5833740234375,
"epoch": 0.15257142857142858,
"grad_norm": 0.706413209438324,
"kl": 0.016357421875,
"learning_rate": 2.4557103130734763e-07,
"loss": -0.7323,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 2088.666748046875,
"epoch": 0.15314285714285714,
"grad_norm": 2.1446173191070557,
"kl": 0.019317626953125,
"learning_rate": 2.4228111583480596e-07,
"loss": -1.7378,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 2598.20849609375,
"epoch": 0.15371428571428572,
"grad_norm": 0.18762169778347015,
"kl": 0.012420654296875,
"learning_rate": 2.390218079559109e-07,
"loss": 0.002,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 1651.0833740234375,
"epoch": 0.15428571428571428,
"grad_norm": 0.6639404892921448,
"kl": 0.01995849609375,
"learning_rate": 2.3579343186126726e-07,
"loss": -0.7353,
"reward": 0.2916666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 991.2500610351562,
"epoch": 0.15485714285714286,
"grad_norm": 0.9474217891693115,
"kl": 0.0179443359375,
"learning_rate": 2.3259630866481605e-07,
"loss": -1.8744,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 2655.5834350585938,
"epoch": 0.15542857142857142,
"grad_norm": 0.628149688243866,
"kl": 0.017578125,
"learning_rate": 2.294307563718949e-07,
"loss": -0.7414,
"reward": 0.2083333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 2282.3333740234375,
"epoch": 0.156,
"grad_norm": 1.2667988538742065,
"kl": 0.0169677734375,
"learning_rate": 2.2629708984760706e-07,
"loss": -1.6717,
"reward": 0.1250000037252903,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 2919.916748046875,
"epoch": 0.15657142857142858,
"grad_norm": 0.7472139596939087,
"kl": 0.01800537109375,
"learning_rate": 2.2319562078550318e-07,
"loss": -0.9381,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 1767.5000610351562,
"epoch": 0.15714285714285714,
"grad_norm": 1.192237377166748,
"kl": 0.01751708984375,
"learning_rate": 2.2012665767657823e-07,
"loss": -0.7415,
"reward": 0.2083333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 1048.2917175292969,
"epoch": 0.15771428571428572,
"grad_norm": 1.2694878578186035,
"kl": 0.018829345703125,
"learning_rate": 2.1709050577858728e-07,
"loss": -1.8771,
"reward": 0.583333358168602,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.583333358168602,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 2410.416748046875,
"epoch": 0.15828571428571428,
"grad_norm": 1.6138927936553955,
"kl": 0.030670166015625,
"learning_rate": 2.1408746708568242e-07,
"loss": -1.93,
"reward": 0.2083333432674408,
"reward_std": 0.26603007316589355,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 2286.125,
"epoch": 0.15885714285714286,
"grad_norm": 0.7141113877296448,
"kl": 0.014129638671875,
"learning_rate": 2.1111784029837509e-07,
"loss": -0.742,
"reward": 0.2083333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 1842.8750610351562,
"epoch": 0.15942857142857142,
"grad_norm": 0.6313254833221436,
"kl": 0.01605224609375,
"learning_rate": 2.081819207938249e-07,
"loss": -0.939,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 2166.541748046875,
"epoch": 0.16,
"grad_norm": 0.8989630341529846,
"kl": 0.015899658203125,
"learning_rate": 2.0528000059645995e-07,
"loss": -0.9391,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 1435.3750915527344,
"epoch": 0.16057142857142856,
"grad_norm": 0.4465429186820984,
"kl": 0.01531982421875,
"learning_rate": 2.0241236834893028e-07,
"loss": 0.0025,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 2514.5833740234375,
"epoch": 0.16114285714285714,
"grad_norm": 1.1687437295913696,
"kl": 0.02166748046875,
"learning_rate": 1.9957930928339772e-07,
"loss": -1.8717,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 2165.3333740234375,
"epoch": 0.16171428571428573,
"grad_norm": 0.9040661454200745,
"kl": 0.016571044921875,
"learning_rate": 1.96781105193165e-07,
"loss": -0.9278,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 2579.9583740234375,
"epoch": 0.16228571428571428,
"grad_norm": 0.7441158890724182,
"kl": 0.01458740234375,
"learning_rate": 1.9401803440464654e-07,
"loss": -1.6716,
"reward": 0.375,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.375,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 3037.75,
"epoch": 0.16285714285714287,
"grad_norm": 0.6573392152786255,
"kl": 0.015625,
"learning_rate": 1.9129037174968505e-07,
"loss": -0.9305,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 2087.666748046875,
"epoch": 0.16342857142857142,
"grad_norm": 0.9549260139465332,
"kl": 0.017333984375,
"learning_rate": 1.8859838853821435e-07,
"loss": -2.6746,
"reward": 0.2500000074505806,
"reward_std": 0.3680921643972397,
"rewards/accuracy_reward": 0.2500000074505806,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 1758.9166870117188,
"epoch": 0.164,
"grad_norm": 1.3493952751159668,
"kl": 0.0152587890625,
"learning_rate": 1.8594235253127372e-07,
"loss": -2.8729,
"reward": 0.291666679084301,
"reward_std": 0.3951295167207718,
"rewards/accuracy_reward": 0.291666679084301,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 2302.3751220703125,
"epoch": 0.16457142857142856,
"grad_norm": 0.5830644965171814,
"kl": 0.0174560546875,
"learning_rate": 1.8332252791437486e-07,
"loss": -0.7415,
"reward": 0.2083333432674408,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 2104.8333740234375,
"epoch": 0.16514285714285715,
"grad_norm": 0.9054921269416809,
"kl": 0.0169677734375,
"learning_rate": 1.8073917527122385e-07,
"loss": -1.7252,
"reward": 0.1666666716337204,
"reward_std": 0.23899272084236145,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 1549.2500915527344,
"epoch": 0.1657142857142857,
"grad_norm": 1.4962562322616577,
"kl": 0.02374267578125,
"learning_rate": 1.7819255155780238e-07,
"loss": -0.9334,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 2643.3751220703125,
"epoch": 0.1662857142857143,
"grad_norm": 0.8332542777061462,
"kl": 0.020294189453125,
"learning_rate": 1.7568291007680907e-07,
"loss": -1.4634,
"reward": 0.0833333358168602,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 2123.791748046875,
"epoch": 0.16685714285714287,
"grad_norm": 0.7176339030265808,
"kl": 0.0137939453125,
"learning_rate": 1.7321050045246455e-07,
"loss": -1.4564,
"reward": 0.0833333358168602,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 1340.9167175292969,
"epoch": 0.16742857142857143,
"grad_norm": 1.2635085582733154,
"kl": 0.011871337890625,
"learning_rate": 1.7077556860568238e-07,
"loss": -2.4176,
"reward": 0.3333333544433117,
"reward_std": 0.3332235738635063,
"rewards/accuracy_reward": 0.3333333544433117,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 2578.8333740234375,
"epoch": 0.168,
"grad_norm": 0.5623243451118469,
"kl": 0.0145263671875,
"learning_rate": 1.6837835672960831e-07,
"loss": -0.7386,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 2309.2084350585938,
"epoch": 0.16857142857142857,
"grad_norm": 1.792176365852356,
"kl": 0.014251708984375,
"learning_rate": 1.6601910326552998e-07,
"loss": -1.6771,
"reward": 0.1250000037252903,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 2761.2083740234375,
"epoch": 0.16914285714285715,
"grad_norm": 0.5179618000984192,
"kl": 0.0174560546875,
"learning_rate": 1.6369804287916025e-07,
"loss": -0.74,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 2832.916748046875,
"epoch": 0.1697142857142857,
"grad_norm": 1.062676191329956,
"kl": 0.0169677734375,
"learning_rate": 1.6141540643729612e-07,
"loss": -2.6165,
"reward": 0.2916666679084301,
"reward_std": 0.3602609559893608,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 2174.5,
"epoch": 0.1702857142857143,
"grad_norm": 1.0952427387237549,
"kl": 0.015716552734375,
"learning_rate": 1.5917142098485503e-07,
"loss": -1.4785,
"reward": 0.25,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.25,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 1648.291748046875,
"epoch": 0.17085714285714285,
"grad_norm": 0.9061567187309265,
"kl": 0.016937255859375,
"learning_rate": 1.5696630972229166e-07,
"loss": -1.6821,
"reward": 0.2083333395421505,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.2083333395421505,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 2321.9583740234375,
"epoch": 0.17142857142857143,
"grad_norm": 0.47703251242637634,
"kl": 0.01666259765625,
"learning_rate": 1.548002919833971e-07,
"loss": -0.7289,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 2111.7501220703125,
"epoch": 0.172,
"grad_norm": 0.47726908326148987,
"kl": 0.01434326171875,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0023,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 1759.4166870117188,
"epoch": 0.17257142857142857,
"grad_norm": 0.6928355693817139,
"kl": 0.016265869140625,
"learning_rate": 1.5058639494795067e-07,
"loss": -0.9378,
"reward": 0.4166666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666716337204,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 2077.5,
"epoch": 0.17314285714285715,
"grad_norm": 0.6859722137451172,
"kl": 0.0133056640625,
"learning_rate": 1.485389347912525e-07,
"loss": -1.4857,
"reward": 0.2500000111758709,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.2500000111758709,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 2916.2501220703125,
"epoch": 0.1737142857142857,
"grad_norm": 0.9551180601119995,
"kl": 0.01416015625,
"learning_rate": 1.4653140639624066e-07,
"loss": -1.8796,
"reward": 0.1666666716337204,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 2326.8750610351562,
"epoch": 0.1742857142857143,
"grad_norm": 0.44118639826774597,
"kl": 0.0216064453125,
"learning_rate": 1.4456400944391144e-07,
"loss": -0.7264,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 1633.6666870117188,
"epoch": 0.17485714285714285,
"grad_norm": 0.9067421555519104,
"kl": 0.02105712890625,
"learning_rate": 1.4263693962354336e-07,
"loss": -1.4626,
"reward": 0.3333333544433117,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.3333333544433117,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 2906.0833740234375,
"epoch": 0.17542857142857143,
"grad_norm": 0.49431222677230835,
"kl": 0.011474609375,
"learning_rate": 1.4075038861323302e-07,
"loss": -0.7356,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 1410.7500610351562,
"epoch": 0.176,
"grad_norm": 0.8035596609115601,
"kl": 0.01629638671875,
"learning_rate": 1.3890454406082956e-07,
"loss": -0.9359,
"reward": 0.5833333432674408,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.5833333432674408,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 3014.25,
"epoch": 0.17657142857142857,
"grad_norm": 0.2768433392047882,
"kl": 0.01849365234375,
"learning_rate": 1.3709958956526974e-07,
"loss": 0.0029,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 3245.7083740234375,
"epoch": 0.17714285714285713,
"grad_norm": 0.7835389971733093,
"kl": 0.01654052734375,
"learning_rate": 1.353357046583165e-07,
"loss": -2.2183,
"reward": 0.1250000037252903,
"reward_std": 0.3061862140893936,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 2727.3333740234375,
"epoch": 0.1777142857142857,
"grad_norm": 0.48111966252326965,
"kl": 0.009368896484375,
"learning_rate": 1.3361306478670148e-07,
"loss": -0.9302,
"reward": 0.0833333358168602,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 1927.541748046875,
"epoch": 0.1782857142857143,
"grad_norm": 0.5695589184761047,
"kl": 0.02117919921875,
"learning_rate": 1.3193184129467384e-07,
"loss": 0.0034,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 2037.0000610351562,
"epoch": 0.17885714285714285,
"grad_norm": 1.5638245344161987,
"kl": 0.017608642578125,
"learning_rate": 1.3029220140695756e-07,
"loss": -1.6713,
"reward": 0.125,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.125,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 2119.4584350585938,
"epoch": 0.17942857142857144,
"grad_norm": 0.8178718686103821,
"kl": 0.0159912109375,
"learning_rate": 1.2869430821211826e-07,
"loss": -1.6774,
"reward": 0.458333358168602,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.458333358168602,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 2671.666748046875,
"epoch": 0.18,
"grad_norm": 0.7374725341796875,
"kl": 0.012298583984375,
"learning_rate": 1.2713832064634125e-07,
"loss": -1.6682,
"reward": 0.1250000037252903,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 1793.666748046875,
"epoch": 0.18057142857142858,
"grad_norm": 0.8555610775947571,
"kl": 0.0196533203125,
"learning_rate": 1.2562439347762275e-07,
"loss": -0.7343,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 2572.6250610351562,
"epoch": 0.18114285714285713,
"grad_norm": 0.7458478808403015,
"kl": 0.012847900390625,
"learning_rate": 1.2415267729037608e-07,
"loss": -1.6769,
"reward": 0.1250000037252903,
"reward_std": 0.23116151988506317,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 1808.0000610351562,
"epoch": 0.18171428571428572,
"grad_norm": 1.242701530456543,
"kl": 0.017181396484375,
"learning_rate": 1.2272331847045313e-07,
"loss": -2.6198,
"reward": 0.3750000223517418,
"reward_std": 0.3602609485387802,
"rewards/accuracy_reward": 0.3750000223517418,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 2072.875,
"epoch": 0.18228571428571427,
"grad_norm": 0.43984729051589966,
"kl": 0.01568603515625,
"learning_rate": 1.2133645919058418e-07,
"loss": -0.939,
"reward": 0.4166666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666716337204,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 2183.375,
"epoch": 0.18285714285714286,
"grad_norm": 0.7200397849082947,
"kl": 0.017120361328125,
"learning_rate": 1.1999223739623666e-07,
"loss": -1.6787,
"reward": 0.125,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.125,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 3046.916748046875,
"epoch": 0.18342857142857144,
"grad_norm": 0.5004404187202454,
"kl": 0.016571044921875,
"learning_rate": 1.1869078679189393e-07,
"loss": -0.7381,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 1882.75,
"epoch": 0.184,
"grad_norm": 1.3577691316604614,
"kl": 0.0198974609375,
"learning_rate": 1.1743223682775649e-07,
"loss": -3.1577,
"reward": 0.2083333358168602,
"reward_std": 0.43528564274311066,
"rewards/accuracy_reward": 0.2083333358168602,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 2563.2083740234375,
"epoch": 0.18457142857142858,
"grad_norm": 1.3319201469421387,
"kl": 0.023193359375,
"learning_rate": 1.1621671268686605e-07,
"loss": -3.1597,
"reward": 0.291666679084301,
"reward_std": 0.43528565764427185,
"rewards/accuracy_reward": 0.291666679084301,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 2025.666748046875,
"epoch": 0.18514285714285714,
"grad_norm": 0.5463172793388367,
"kl": 0.02020263671875,
"learning_rate": 1.1504433527265378e-07,
"loss": -1.4631,
"reward": 0.0833333358168602,
"reward_std": 0.20412413775920868,
"rewards/accuracy_reward": 0.0833333358168602,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 2767.6251220703125,
"epoch": 0.18571428571428572,
"grad_norm": 0.6291061639785767,
"kl": 0.0208740234375,
"learning_rate": 1.1391522119691496e-07,
"loss": -0.7406,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 1606.8750610351562,
"epoch": 0.18628571428571428,
"grad_norm": 1.8553162813186646,
"kl": 0.02337646484375,
"learning_rate": 1.1282948276820962e-07,
"loss": -1.9341,
"reward": 0.2916666865348816,
"reward_std": 0.26603007316589355,
"rewards/accuracy_reward": 0.2916666865348816,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 2260.25,
"epoch": 0.18685714285714286,
"grad_norm": 0.6469095945358276,
"kl": 0.014312744140625,
"learning_rate": 1.1178722798069215e-07,
"loss": 0.0023,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 2050.666748046875,
"epoch": 0.18742857142857142,
"grad_norm": 0.7297951579093933,
"kl": 0.017822265625,
"learning_rate": 1.10788560503369e-07,
"loss": -0.9389,
"reward": 0.1666666716337204,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.1666666716337204,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 2868.33349609375,
"epoch": 0.188,
"grad_norm": 1.0545990467071533,
"kl": 0.01544189453125,
"learning_rate": 1.0983357966978745e-07,
"loss": -2.73,
"reward": 0.2916666679084301,
"reward_std": 0.37592335790395737,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 2244.7501220703125,
"epoch": 0.18857142857142858,
"grad_norm": 1.0005416870117188,
"kl": 0.017578125,
"learning_rate": 1.0892238046815527e-07,
"loss": -2.2275,
"reward": 0.291666679084301,
"reward_std": 0.3061862140893936,
"rewards/accuracy_reward": 0.291666679084301,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 1698.3333435058594,
"epoch": 0.18914285714285714,
"grad_norm": 0.24079620838165283,
"kl": 0.01593017578125,
"learning_rate": 1.0805505353189254e-07,
"loss": 0.0029,
"reward": 0.25,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.25,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 1608.1667175292969,
"epoch": 0.18971428571428572,
"grad_norm": 0.5765926837921143,
"kl": 0.013397216796875,
"learning_rate": 1.0723168513061665e-07,
"loss": -0.7406,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 2000.416748046875,
"epoch": 0.19028571428571428,
"grad_norm": 0.4461122751235962,
"kl": 0.01611328125,
"learning_rate": 1.0645235716156168e-07,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 2835.75,
"epoch": 0.19085714285714286,
"grad_norm": 0.9358536005020142,
"kl": 0.01904296875,
"learning_rate": 1.0571714714143197e-07,
"loss": -1.6761,
"reward": 0.2083333432674408,
"reward_std": 0.23116150498390198,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 1533.125,
"epoch": 0.19142857142857142,
"grad_norm": 0.58393394947052,
"kl": 0.0191650390625,
"learning_rate": 1.0502612819869216e-07,
"loss": 0.003,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 1860.0833740234375,
"epoch": 0.192,
"grad_norm": 0.9539235234260559,
"kl": 0.0213623046875,
"learning_rate": 1.0437936906629334e-07,
"loss": -2.6119,
"reward": 0.2083333358168602,
"reward_std": 0.3602609485387802,
"rewards/accuracy_reward": 0.2083333358168602,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 2359.9584350585938,
"epoch": 0.19257142857142856,
"grad_norm": 0.591833233833313,
"kl": 0.020263671875,
"learning_rate": 1.0377693407483638e-07,
"loss": -0.9325,
"reward": 0.4166666865348816,
"reward_std": 0.12909944355487823,
"rewards/accuracy_reward": 0.4166666865348816,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 2456.5834350585938,
"epoch": 0.19314285714285714,
"grad_norm": 1.7795339822769165,
"kl": 0.01800537109375,
"learning_rate": 1.032188831461732e-07,
"loss": -3.5502,
"reward": 0.375,
"reward_std": 0.48936039209365845,
"rewards/accuracy_reward": 0.375,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 2746.6251220703125,
"epoch": 0.19371428571428573,
"grad_norm": 0.5191667675971985,
"kl": 0.0203857421875,
"learning_rate": 1.0270527178744664e-07,
"loss": -0.7381,
"reward": 0.0416666679084301,
"reward_std": 0.10206207633018494,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 1982.0000610351562,
"epoch": 0.19428571428571428,
"grad_norm": 0.5445987582206726,
"kl": 0.01641845703125,
"learning_rate": 1.0223615108556937e-07,
"loss": 0.0026,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 1073.125,
"epoch": 0.19485714285714287,
"grad_norm": 1.0652012825012207,
"kl": 0.01904296875,
"learning_rate": 1.0181156770214242e-07,
"loss": -1.9294,
"reward": 0.2083333358168602,
"reward_std": 0.26603008806705475,
"rewards/accuracy_reward": 0.2083333358168602,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 1901.2917175292969,
"epoch": 0.19542857142857142,
"grad_norm": 1.262035846710205,
"kl": 0.016754150390625,
"learning_rate": 1.0143156386881408e-07,
"loss": -1.8793,
"reward": 0.2500000074505806,
"reward_std": 0.25819888710975647,
"rewards/accuracy_reward": 0.2500000074505806,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 1721.8750610351562,
"epoch": 0.196,
"grad_norm": 0.8769494295120239,
"kl": 0.02349853515625,
"learning_rate": 1.0109617738307911e-07,
"loss": -0.7405,
"reward": 0.2083333432674408,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.2083333432674408,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 2183.2084350585938,
"epoch": 0.19657142857142856,
"grad_norm": 2.0577425956726074,
"kl": 0.01995849609375,
"learning_rate": 1.0080544160451918e-07,
"loss": -2.7138,
"reward": 0.2916666679084301,
"reward_std": 0.37592336535453796,
"rewards/accuracy_reward": 0.2916666679084301,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 1418.0833435058594,
"epoch": 0.19714285714285715,
"grad_norm": 0.6003240942955017,
"kl": 0.017822265625,
"learning_rate": 1.0055938545148495e-07,
"loss": 0.0033,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 2902.291748046875,
"epoch": 0.1977142857142857,
"grad_norm": 0.4958335757255554,
"kl": 0.01824951171875,
"learning_rate": 1.0035803339821934e-07,
"loss": -0.7393,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 2797.2083740234375,
"epoch": 0.1982857142857143,
"grad_norm": 0.5800639390945435,
"kl": 0.012176513671875,
"learning_rate": 1.002014054724235e-07,
"loss": -1.6757,
"reward": 0.1250000037252903,
"reward_std": 0.23116151243448257,
"rewards/accuracy_reward": 0.1250000037252903,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 2299.666748046875,
"epoch": 0.19885714285714284,
"grad_norm": 0.43301284313201904,
"kl": 0.02374267578125,
"learning_rate": 1.0008951725326441e-07,
"loss": 0.0038,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 3268.9583740234375,
"epoch": 0.19942857142857143,
"grad_norm": 0.4114702641963959,
"kl": 0.0225830078125,
"learning_rate": 1.0002237986982564e-07,
"loss": -0.7369,
"reward": 0.0416666679084301,
"reward_std": 0.10206206887960434,
"rewards/accuracy_reward": 0.0416666679084301,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 1340.791748046875,
"epoch": 0.2,
"grad_norm": 0.5905561447143555,
"kl": 0.017578125,
"learning_rate": 1e-07,
"loss": 0.0026,
"reward": 0.5,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.5,
"step": 350
},
{
"epoch": 0.2,
"step": 350,
"total_flos": 0.0,
"train_loss": -1.2733016510141806,
"train_runtime": 18317.2078,
"train_samples_per_second": 0.459,
"train_steps_per_second": 0.019
}
],
"logging_steps": 1,
"max_steps": 350,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}