4943 lines
152 KiB
JSON
4943 lines
152 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.2,
|
|
"eval_steps": 500,
|
|
"global_step": 350,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3134.95849609375,
|
|
"epoch": 0.0005714285714285715,
|
|
"grad_norm": 0.6937011480331421,
|
|
"kl": 0.0,
|
|
"learning_rate": 2e-08,
|
|
"loss": -0.7208,
|
|
"reward": 0.27500003203749657,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 1
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2868.9583740234375,
|
|
"epoch": 0.001142857142857143,
|
|
"grad_norm": 1.1340324878692627,
|
|
"kl": 0.0,
|
|
"learning_rate": 4e-08,
|
|
"loss": -0.7386,
|
|
"reward": 0.27500003576278687,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 2
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3083.70849609375,
|
|
"epoch": 0.0017142857142857142,
|
|
"grad_norm": 1.927505373954773,
|
|
"kl": 1.7076730728149414e-05,
|
|
"learning_rate": 6e-08,
|
|
"loss": -2.292,
|
|
"reward": 0.3500000163912773,
|
|
"reward_std": 0.40609321743249893,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.3333333358168602,
|
|
"step": 3
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2651.791748046875,
|
|
"epoch": 0.002285714285714286,
|
|
"grad_norm": 2.0249767303466797,
|
|
"kl": 3.701448440551758e-05,
|
|
"learning_rate": 8e-08,
|
|
"loss": -2.4107,
|
|
"reward": 0.40000002086162567,
|
|
"reward_std": 0.27739381790161133,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 4
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2761.916748046875,
|
|
"epoch": 0.002857142857142857,
|
|
"grad_norm": 2.0101120471954346,
|
|
"kl": 3.8623809814453125e-05,
|
|
"learning_rate": 1e-07,
|
|
"loss": -0.725,
|
|
"reward": 0.17500000819563866,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 5
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3023.8333740234375,
|
|
"epoch": 0.0034285714285714284,
|
|
"grad_norm": 0.3757542073726654,
|
|
"kl": 3.993511199951172e-05,
|
|
"learning_rate": 1.2e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.15000000596046448,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 6
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3147.75,
|
|
"epoch": 0.004,
|
|
"grad_norm": 1.2157344818115234,
|
|
"kl": 2.2083520889282227e-05,
|
|
"learning_rate": 1.4e-07,
|
|
"loss": -1.6438,
|
|
"reward": 0.15000000596046448,
|
|
"reward_std": 0.19993415474891663,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.2083333432674408,
|
|
"step": 7
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3028.1251220703125,
|
|
"epoch": 0.004571428571428572,
|
|
"grad_norm": 0.7169070243835449,
|
|
"kl": 3.1948089599609375e-05,
|
|
"learning_rate": 1.6e-07,
|
|
"loss": -0.9882,
|
|
"reward": 0.2750000059604645,
|
|
"reward_std": 0.14747881889343262,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 8
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2304.666748046875,
|
|
"epoch": 0.005142857142857143,
|
|
"grad_norm": 0.383722186088562,
|
|
"kl": 3.325939178466797e-05,
|
|
"learning_rate": 1.8e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.4500000327825546,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.2500000111758709,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 9
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3193.625,
|
|
"epoch": 0.005714285714285714,
|
|
"grad_norm": 1.468209147453308,
|
|
"kl": 3.314018249511719e-05,
|
|
"learning_rate": 2e-07,
|
|
"loss": -1.6361,
|
|
"reward": 0.22500000894069672,
|
|
"reward_std": 0.22493848204612732,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 10
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2689.916748046875,
|
|
"epoch": 0.006285714285714286,
|
|
"grad_norm": 1.3963018655776978,
|
|
"kl": 5.7578086853027344e-05,
|
|
"learning_rate": 2.1999999999999998e-07,
|
|
"loss": -0.8156,
|
|
"reward": 0.2500000223517418,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.3333333358168602,
|
|
"step": 11
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2627.541748046875,
|
|
"epoch": 0.006857142857142857,
|
|
"grad_norm": 1.4502061605453491,
|
|
"kl": 3.600120544433594e-05,
|
|
"learning_rate": 2.4e-07,
|
|
"loss": -2.2199,
|
|
"reward": 0.5500000715255737,
|
|
"reward_std": 0.3433297872543335,
|
|
"rewards/accuracy_reward": 0.2916666716337204,
|
|
"rewards/format_reward": 0.6250000298023224,
|
|
"step": 12
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2780.58349609375,
|
|
"epoch": 0.0074285714285714285,
|
|
"grad_norm": 0.4417027235031128,
|
|
"kl": 1.5676021575927734e-05,
|
|
"learning_rate": 2.6e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 13
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2180.0,
|
|
"epoch": 0.008,
|
|
"grad_norm": 1.337963342666626,
|
|
"kl": 4.553794860839844e-05,
|
|
"learning_rate": 2.8e-07,
|
|
"loss": -1.6775,
|
|
"reward": 0.7000000476837158,
|
|
"reward_std": 0.19993416219949722,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.7083333730697632,
|
|
"step": 14
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2745.791748046875,
|
|
"epoch": 0.008571428571428572,
|
|
"grad_norm": 2.005941390991211,
|
|
"kl": 3.337860107421875e-05,
|
|
"learning_rate": 3e-07,
|
|
"loss": -1.7196,
|
|
"reward": 0.2500000149011612,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 15
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2953.541748046875,
|
|
"epoch": 0.009142857142857144,
|
|
"grad_norm": 1.1998287439346313,
|
|
"kl": 4.00543212890625e-05,
|
|
"learning_rate": 3.2e-07,
|
|
"loss": -1.4557,
|
|
"reward": 0.42500001192092896,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.5000000298023224,
|
|
"step": 16
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3183.916748046875,
|
|
"epoch": 0.009714285714285713,
|
|
"grad_norm": 3.6435375213623047,
|
|
"kl": 5.614757537841797e-05,
|
|
"learning_rate": 3.4000000000000003e-07,
|
|
"loss": -2.4555,
|
|
"reward": 0.17500000447034836,
|
|
"reward_std": 0.28209254145622253,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.2500000111758709,
|
|
"step": 17
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3134.75,
|
|
"epoch": 0.010285714285714285,
|
|
"grad_norm": 0.2575957179069519,
|
|
"kl": 5.364418029785156e-05,
|
|
"learning_rate": 3.6e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.15000000596046448,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 18
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2953.791748046875,
|
|
"epoch": 0.010857142857142857,
|
|
"grad_norm": 1.4193103313446045,
|
|
"kl": 5.936622619628906e-05,
|
|
"learning_rate": 3.7999999999999996e-07,
|
|
"loss": -0.7348,
|
|
"reward": 0.30000003799796104,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 19
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1856.7083740234375,
|
|
"epoch": 0.011428571428571429,
|
|
"grad_norm": 0.3330537974834442,
|
|
"kl": 2.3543834686279297e-05,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 20
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1932.2501220703125,
|
|
"epoch": 0.012,
|
|
"grad_norm": 2.0459184646606445,
|
|
"kl": 3.30805778503418e-05,
|
|
"learning_rate": 4.1999999999999995e-07,
|
|
"loss": -1.7473,
|
|
"reward": 0.5250000357627869,
|
|
"reward_std": 0.24647516012191772,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 21
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3040.416748046875,
|
|
"epoch": 0.012571428571428572,
|
|
"grad_norm": 2.663806676864624,
|
|
"kl": 4.1604042053222656e-05,
|
|
"learning_rate": 4.3999999999999997e-07,
|
|
"loss": -3.5218,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.5641850829124451,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 22
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2432.791748046875,
|
|
"epoch": 0.013142857142857144,
|
|
"grad_norm": 1.2767139673233032,
|
|
"kl": 4.3511390686035156e-05,
|
|
"learning_rate": 4.6e-07,
|
|
"loss": -0.9799,
|
|
"reward": 0.4500000327825546,
|
|
"reward_std": 0.20871607959270477,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 23
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2564.2083740234375,
|
|
"epoch": 0.013714285714285714,
|
|
"grad_norm": 0.6577972173690796,
|
|
"kl": 4.863739013671875e-05,
|
|
"learning_rate": 4.8e-07,
|
|
"loss": -0.7097,
|
|
"reward": 0.5250000357627869,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.3333333358168602,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 24
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1678.4167175292969,
|
|
"epoch": 0.014285714285714285,
|
|
"grad_norm": 0.5228314995765686,
|
|
"kl": 4.172325134277344e-05,
|
|
"learning_rate": 5e-07,
|
|
"loss": -0.4805,
|
|
"reward": 0.6250000298023224,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.2500000111758709,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 25
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2865.6251220703125,
|
|
"epoch": 0.014857142857142857,
|
|
"grad_norm": 1.137710690498352,
|
|
"kl": 5.5909156799316406e-05,
|
|
"learning_rate": 5.2e-07,
|
|
"loss": -0.9844,
|
|
"reward": 0.3750000149011612,
|
|
"reward_std": 0.24647516012191772,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 26
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3516.375,
|
|
"epoch": 0.015428571428571429,
|
|
"grad_norm": 2.815162420272827,
|
|
"kl": 7.021427154541016e-05,
|
|
"learning_rate": 5.4e-07,
|
|
"loss": -2.2754,
|
|
"reward": 0.1250000111758709,
|
|
"reward_std": 0.2479735016822815,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.1666666679084301,
|
|
"step": 27
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2613.416748046875,
|
|
"epoch": 0.016,
|
|
"grad_norm": 1.428934931755066,
|
|
"kl": 7.724761962890625e-05,
|
|
"learning_rate": 5.6e-07,
|
|
"loss": -1.9051,
|
|
"reward": 0.42500002682209015,
|
|
"reward_std": 0.323934830725193,
|
|
"rewards/accuracy_reward": 0.2083333358168602,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 28
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2780.9583740234375,
|
|
"epoch": 0.01657142857142857,
|
|
"grad_norm": 1.547488808631897,
|
|
"kl": 7.271766662597656e-05,
|
|
"learning_rate": 5.8e-07,
|
|
"loss": -1.3492,
|
|
"reward": 0.30000003427267075,
|
|
"reward_std": 0.32240864634513855,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.3333333544433117,
|
|
"step": 29
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3566.375,
|
|
"epoch": 0.017142857142857144,
|
|
"grad_norm": 1.6621514558792114,
|
|
"kl": 0.000118255615234375,
|
|
"learning_rate": 6e-07,
|
|
"loss": -1.2378,
|
|
"reward": 0.15000000596046448,
|
|
"reward_std": 0.27739381790161133,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.125,
|
|
"step": 30
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2331.5001220703125,
|
|
"epoch": 0.017714285714285714,
|
|
"grad_norm": 1.5594860315322876,
|
|
"kl": 0.00010943412780761719,
|
|
"learning_rate": 6.2e-07,
|
|
"loss": -1.4703,
|
|
"reward": 0.40000002086162567,
|
|
"reward_std": 0.2773938253521919,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.458333358168602,
|
|
"step": 31
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2168.8751220703125,
|
|
"epoch": 0.018285714285714287,
|
|
"grad_norm": 0.7033916711807251,
|
|
"kl": 0.00013780593872070312,
|
|
"learning_rate": 6.4e-07,
|
|
"loss": -0.678,
|
|
"reward": 0.6250000298023224,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.5,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 32
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3584.0,
|
|
"epoch": 0.018857142857142857,
|
|
"grad_norm": 0.45104363560676575,
|
|
"kl": 0.0001506805419921875,
|
|
"learning_rate": 6.6e-07,
|
|
"loss": 0.0,
|
|
"reward": 0.0,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.0,
|
|
"step": 33
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2776.4583740234375,
|
|
"epoch": 0.019428571428571427,
|
|
"grad_norm": 0.6990403532981873,
|
|
"kl": 0.00020122528076171875,
|
|
"learning_rate": 6.800000000000001e-07,
|
|
"loss": -0.9026,
|
|
"reward": 0.5500000044703484,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.5833333358168602,
|
|
"step": 34
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3161.8333740234375,
|
|
"epoch": 0.02,
|
|
"grad_norm": 2.173389434814453,
|
|
"kl": 0.0002646446228027344,
|
|
"learning_rate": 7e-07,
|
|
"loss": -1.6205,
|
|
"reward": 0.17500000447034836,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.291666679084301,
|
|
"step": 35
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2875.33349609375,
|
|
"epoch": 0.02057142857142857,
|
|
"grad_norm": 2.574173927307129,
|
|
"kl": 0.0003509521484375,
|
|
"learning_rate": 7.2e-07,
|
|
"loss": -2.5022,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.36425092816352844,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 36
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3408.8333740234375,
|
|
"epoch": 0.021142857142857144,
|
|
"grad_norm": 1.749144434928894,
|
|
"kl": 0.000339508056640625,
|
|
"learning_rate": 7.4e-07,
|
|
"loss": -2.3526,
|
|
"reward": 0.2500000149011612,
|
|
"reward_std": 0.40926575660705566,
|
|
"rewards/accuracy_reward": 0.2083333358168602,
|
|
"rewards/format_reward": 0.2083333358168602,
|
|
"step": 37
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2987.5001220703125,
|
|
"epoch": 0.021714285714285714,
|
|
"grad_norm": 1.8244866132736206,
|
|
"kl": 0.000728607177734375,
|
|
"learning_rate": 7.599999999999999e-07,
|
|
"loss": -1.8345,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.2683281749486923,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 38
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1947.7083740234375,
|
|
"epoch": 0.022285714285714287,
|
|
"grad_norm": 1.6700297594070435,
|
|
"kl": 0.000614166259765625,
|
|
"learning_rate": 7.799999999999999e-07,
|
|
"loss": -2.2225,
|
|
"reward": 0.8000000715255737,
|
|
"reward_std": 0.24494898319244385,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.8750000298023224,
|
|
"step": 39
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3358.75,
|
|
"epoch": 0.022857142857142857,
|
|
"grad_norm": 0.8734477758407593,
|
|
"kl": 0.000514984130859375,
|
|
"learning_rate": 8e-07,
|
|
"loss": -1.4618,
|
|
"reward": 0.10000000894069672,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.1666666716337204,
|
|
"step": 40
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2962.7083740234375,
|
|
"epoch": 0.023428571428571427,
|
|
"grad_norm": 1.2087616920471191,
|
|
"kl": 0.00079345703125,
|
|
"learning_rate": 8.199999999999999e-07,
|
|
"loss": -0.7151,
|
|
"reward": 0.32500001788139343,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.2916666865348816,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 41
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2246.0834350585938,
|
|
"epoch": 0.024,
|
|
"grad_norm": 0.39726927876472473,
|
|
"kl": 0.0006685256958007812,
|
|
"learning_rate": 8.399999999999999e-07,
|
|
"loss": 0.0001,
|
|
"reward": 0.3500000238418579,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 42
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3212.7501220703125,
|
|
"epoch": 0.02457142857142857,
|
|
"grad_norm": 1.559288740158081,
|
|
"kl": 0.000598907470703125,
|
|
"learning_rate": 8.599999999999999e-07,
|
|
"loss": -1.7896,
|
|
"reward": 0.20000001788139343,
|
|
"reward_std": 0.2323790118098259,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.2500000074505806,
|
|
"step": 43
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3197.5001220703125,
|
|
"epoch": 0.025142857142857144,
|
|
"grad_norm": 1.7089260816574097,
|
|
"kl": 0.0013065338134765625,
|
|
"learning_rate": 8.799999999999999e-07,
|
|
"loss": -2.2349,
|
|
"reward": 0.40000003576278687,
|
|
"reward_std": 0.36088940501213074,
|
|
"rewards/accuracy_reward": 0.2916666679084301,
|
|
"rewards/format_reward": 0.3750000149011612,
|
|
"step": 44
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3498.95849609375,
|
|
"epoch": 0.025714285714285714,
|
|
"grad_norm": 1.1292223930358887,
|
|
"kl": 0.0009365081787109375,
|
|
"learning_rate": 9e-07,
|
|
"loss": -1.8457,
|
|
"reward": 0.1250000074505806,
|
|
"reward_std": 0.1596180573105812,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.2083333358168602,
|
|
"step": 45
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2446.4583740234375,
|
|
"epoch": 0.026285714285714287,
|
|
"grad_norm": 0.8362674713134766,
|
|
"kl": 0.002166748046875,
|
|
"learning_rate": 9.2e-07,
|
|
"loss": -0.9237,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.21162375062704086,
|
|
"rewards/accuracy_reward": 0.2500000074505806,
|
|
"rewards/format_reward": 0.6666666865348816,
|
|
"step": 46
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2658.0,
|
|
"epoch": 0.026857142857142857,
|
|
"grad_norm": 1.1083022356033325,
|
|
"kl": 0.001728057861328125,
|
|
"learning_rate": 9.399999999999999e-07,
|
|
"loss": -1.7675,
|
|
"reward": 0.5250000059604645,
|
|
"reward_std": 0.3030136823654175,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 47
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1924.25,
|
|
"epoch": 0.027428571428571427,
|
|
"grad_norm": 1.7161272764205933,
|
|
"kl": 0.01171112060546875,
|
|
"learning_rate": 9.6e-07,
|
|
"loss": 0.0019,
|
|
"reward": 0.6000000089406967,
|
|
"reward_std": 0.22085529565811157,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 48
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2205.7500610351562,
|
|
"epoch": 0.028,
|
|
"grad_norm": 1.093712329864502,
|
|
"kl": 0.003936767578125,
|
|
"learning_rate": 9.8e-07,
|
|
"loss": -0.6857,
|
|
"reward": 0.4000000022351742,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.5416666679084301,
|
|
"step": 49
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3085.291748046875,
|
|
"epoch": 0.02857142857142857,
|
|
"grad_norm": 1.334791660308838,
|
|
"kl": 0.00530242919921875,
|
|
"learning_rate": 1e-06,
|
|
"loss": -2.762,
|
|
"reward": 0.2750000134110451,
|
|
"reward_std": 0.39242780208587646,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.291666679084301,
|
|
"step": 50
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2834.2083740234375,
|
|
"epoch": 0.029142857142857144,
|
|
"grad_norm": 0.6573655605316162,
|
|
"kl": 0.0026569366455078125,
|
|
"learning_rate": 9.999890338174275e-07,
|
|
"loss": 0.0004,
|
|
"reward": 0.27500003576278687,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 51
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3296.791748046875,
|
|
"epoch": 0.029714285714285714,
|
|
"grad_norm": 1.4219310283660889,
|
|
"kl": 0.002716064453125,
|
|
"learning_rate": 9.999561358041868e-07,
|
|
"loss": -2.8224,
|
|
"reward": 0.40000003576278687,
|
|
"reward_std": 0.38455653190612793,
|
|
"rewards/accuracy_reward": 0.2500000074505806,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 52
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2921.291748046875,
|
|
"epoch": 0.030285714285714287,
|
|
"grad_norm": 1.2512001991271973,
|
|
"kl": 0.0057525634765625,
|
|
"learning_rate": 9.999013075636804e-07,
|
|
"loss": 0.0009,
|
|
"reward": 0.25,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 53
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2937.875,
|
|
"epoch": 0.030857142857142857,
|
|
"grad_norm": 0.24358271062374115,
|
|
"kl": 0.00394439697265625,
|
|
"learning_rate": 9.998245517681593e-07,
|
|
"loss": 0.0006,
|
|
"reward": 0.15000000596046448,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 54
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3225.041748046875,
|
|
"epoch": 0.03142857142857143,
|
|
"grad_norm": 0.5058190226554871,
|
|
"kl": 0.0050811767578125,
|
|
"learning_rate": 9.997258721585931e-07,
|
|
"loss": -0.4553,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.19993415474891663,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.2916666865348816,
|
|
"step": 55
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2791.33349609375,
|
|
"epoch": 0.032,
|
|
"grad_norm": 1.2000298500061035,
|
|
"kl": 0.0060272216796875,
|
|
"learning_rate": 9.996052735444862e-07,
|
|
"loss": -1.9483,
|
|
"reward": 0.30000003799796104,
|
|
"reward_std": 0.2173428237438202,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.4583333544433117,
|
|
"step": 56
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2855.5,
|
|
"epoch": 0.03257142857142857,
|
|
"grad_norm": 2.375877618789673,
|
|
"kl": 0.007659912109375,
|
|
"learning_rate": 9.994627618036452e-07,
|
|
"loss": -3.1453,
|
|
"reward": 0.40000002086162567,
|
|
"reward_std": 0.4469800293445587,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.5000000149011612,
|
|
"step": 57
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3482.75,
|
|
"epoch": 0.03314285714285714,
|
|
"grad_norm": 1.0685774087905884,
|
|
"kl": 0.005950927734375,
|
|
"learning_rate": 9.992983438818915e-07,
|
|
"loss": -1.8528,
|
|
"reward": 0.1250000111758709,
|
|
"reward_std": 0.2479735016822815,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.1666666679084301,
|
|
"step": 58
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2283.0834350585938,
|
|
"epoch": 0.03371428571428572,
|
|
"grad_norm": 1.0396032333374023,
|
|
"kl": 0.008758544921875,
|
|
"learning_rate": 9.991120277927223e-07,
|
|
"loss": -1.6818,
|
|
"reward": 0.42500001192092896,
|
|
"reward_std": 0.21615658700466156,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.6250000298023224,
|
|
"step": 59
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3297.45849609375,
|
|
"epoch": 0.03428571428571429,
|
|
"grad_norm": 1.2367874383926392,
|
|
"kl": 0.006866455078125,
|
|
"learning_rate": 9.989038226169207e-07,
|
|
"loss": -3.072,
|
|
"reward": 0.27500002086162567,
|
|
"reward_std": 0.3798578232526779,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.3750000149011612,
|
|
"step": 60
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2158.541748046875,
|
|
"epoch": 0.03485714285714286,
|
|
"grad_norm": 0.9891504049301147,
|
|
"kl": 0.0059051513671875,
|
|
"learning_rate": 9.98673738502114e-07,
|
|
"loss": -1.8628,
|
|
"reward": 0.7249999940395355,
|
|
"reward_std": 0.4160907417535782,
|
|
"rewards/accuracy_reward": 0.5000000298023224,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 61
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2971.291748046875,
|
|
"epoch": 0.03542857142857143,
|
|
"grad_norm": 0.7076906561851501,
|
|
"kl": 0.004241943359375,
|
|
"learning_rate": 9.98421786662277e-07,
|
|
"loss": -0.5466,
|
|
"reward": 0.17500000819563866,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 62
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3438.25,
|
|
"epoch": 0.036,
|
|
"grad_norm": 0.8365570902824402,
|
|
"kl": 0.0075531005859375,
|
|
"learning_rate": 9.981479793771866e-07,
|
|
"loss": -1.6168,
|
|
"reward": 0.17500000447034836,
|
|
"reward_std": 0.26995331048965454,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.1666666679084301,
|
|
"step": 63
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3380.7083740234375,
|
|
"epoch": 0.036571428571428574,
|
|
"grad_norm": 0.727074384689331,
|
|
"kl": 0.00412750244140625,
|
|
"learning_rate": 9.97852329991824e-07,
|
|
"loss": -0.9846,
|
|
"reward": 0.07500000298023224,
|
|
"reward_std": 0.08215838670730591,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.125,
|
|
"step": 64
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3198.916748046875,
|
|
"epoch": 0.037142857142857144,
|
|
"grad_norm": 0.9209883809089661,
|
|
"kl": 0.0122222900390625,
|
|
"learning_rate": 9.975348529157229e-07,
|
|
"loss": -1.6992,
|
|
"reward": 0.22500000894069672,
|
|
"reward_std": 0.21632246673107147,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.2916666716337204,
|
|
"step": 65
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2440.08349609375,
|
|
"epoch": 0.037714285714285714,
|
|
"grad_norm": 1.0546424388885498,
|
|
"kl": 0.00701904296875,
|
|
"learning_rate": 9.971955636222684e-07,
|
|
"loss": -2.3799,
|
|
"reward": 0.550000011920929,
|
|
"reward_std": 0.3548535108566284,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.7500000298023224,
|
|
"step": 66
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2116.5,
|
|
"epoch": 0.038285714285714284,
|
|
"grad_norm": 1.3037738800048828,
|
|
"kl": 0.01519775390625,
|
|
"learning_rate": 9.968344786479415e-07,
|
|
"loss": -1.6571,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.24978766590356827,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 67
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3452.666748046875,
|
|
"epoch": 0.038857142857142854,
|
|
"grad_norm": 0.39050254225730896,
|
|
"kl": 0.00258636474609375,
|
|
"learning_rate": 9.964516155915151e-07,
|
|
"loss": -0.6092,
|
|
"reward": 0.07500000298023224,
|
|
"reward_std": 0.12549901008605957,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.0833333358168602,
|
|
"step": 68
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3062.0,
|
|
"epoch": 0.03942857142857143,
|
|
"grad_norm": 0.89232337474823,
|
|
"kl": 0.00434112548828125,
|
|
"learning_rate": 9.960469931131936e-07,
|
|
"loss": -1.2788,
|
|
"reward": 0.30000000447034836,
|
|
"reward_std": 0.2658701241016388,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.3333333544433117,
|
|
"step": 69
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2577.166748046875,
|
|
"epoch": 0.04,
|
|
"grad_norm": 1.3402986526489258,
|
|
"kl": 0.0091552734375,
|
|
"learning_rate": 9.956206309337066e-07,
|
|
"loss": -1.5284,
|
|
"reward": 0.42500000447034836,
|
|
"reward_std": 0.20463287830352783,
|
|
"rewards/accuracy_reward": 0.2916666679084301,
|
|
"rewards/format_reward": 0.4166666679084301,
|
|
"step": 70
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1986.875,
|
|
"epoch": 0.04057142857142857,
|
|
"grad_norm": 1.0082634687423706,
|
|
"kl": 0.004364013671875,
|
|
"learning_rate": 9.951725498333448e-07,
|
|
"loss": -1.6703,
|
|
"reward": 0.6500000059604645,
|
|
"reward_std": 0.19993415474891663,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 71
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2902.125,
|
|
"epoch": 0.04114285714285714,
|
|
"grad_norm": 0.5259437561035156,
|
|
"kl": 0.008880615234375,
|
|
"learning_rate": 9.947027716509488e-07,
|
|
"loss": -0.8052,
|
|
"reward": 0.3500000163912773,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.3333333358168602,
|
|
"step": 72
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2200.7500610351562,
|
|
"epoch": 0.04171428571428572,
|
|
"grad_norm": 0.7980517745018005,
|
|
"kl": 0.01031494140625,
|
|
"learning_rate": 9.942113192828444e-07,
|
|
"loss": -0.9314,
|
|
"reward": 0.5,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.6666666865348816,
|
|
"step": 73
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2764.2083740234375,
|
|
"epoch": 0.04228571428571429,
|
|
"grad_norm": 1.3613859415054321,
|
|
"kl": 0.00328826904296875,
|
|
"learning_rate": 9.93698216681727e-07,
|
|
"loss": -1.641,
|
|
"reward": 0.3500000163912773,
|
|
"reward_std": 0.28679126501083374,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.4166666679084301,
|
|
"step": 74
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2020.166748046875,
|
|
"epoch": 0.04285714285714286,
|
|
"grad_norm": 1.677713394165039,
|
|
"kl": 0.015411376953125,
|
|
"learning_rate": 9.931634888554935e-07,
|
|
"loss": -1.892,
|
|
"reward": 0.5500000715255737,
|
|
"reward_std": 0.29662763327360153,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 75
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3099.291748046875,
|
|
"epoch": 0.04342857142857143,
|
|
"grad_norm": 0.7256157398223877,
|
|
"kl": 0.0051727294921875,
|
|
"learning_rate": 9.926071618660237e-07,
|
|
"loss": -1.579,
|
|
"reward": 0.2750000059604645,
|
|
"reward_std": 0.22555401921272278,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 76
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3111.20849609375,
|
|
"epoch": 0.044,
|
|
"grad_norm": 0.5698821544647217,
|
|
"kl": 0.011016845703125,
|
|
"learning_rate": 9.9202926282791e-07,
|
|
"loss": 0.0018,
|
|
"reward": 0.42500004172325134,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 77
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2611.541748046875,
|
|
"epoch": 0.044571428571428574,
|
|
"grad_norm": 1.2385872602462769,
|
|
"kl": 0.009613037109375,
|
|
"learning_rate": 9.91429819907136e-07,
|
|
"loss": -2.4486,
|
|
"reward": 0.5750000178813934,
|
|
"reward_std": 0.42487265169620514,
|
|
"rewards/accuracy_reward": 0.4166666716337204,
|
|
"rewards/format_reward": 0.5416666716337204,
|
|
"step": 78
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2208.3751220703125,
|
|
"epoch": 0.045142857142857144,
|
|
"grad_norm": 0.9444563388824463,
|
|
"kl": 0.011566162109375,
|
|
"learning_rate": 9.908088623197048e-07,
|
|
"loss": -2.1262,
|
|
"reward": 0.550000011920929,
|
|
"reward_std": 0.29662764072418213,
|
|
"rewards/accuracy_reward": 0.2500000111758709,
|
|
"rewards/format_reward": 0.6666666865348816,
|
|
"step": 79
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3514.791748046875,
|
|
"epoch": 0.045714285714285714,
|
|
"grad_norm": 0.6574345231056213,
|
|
"kl": 0.0064697265625,
|
|
"learning_rate": 9.901664203302124e-07,
|
|
"loss": -0.9746,
|
|
"reward": 0.07500000298023224,
|
|
"reward_std": 0.08215838670730591,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.125,
|
|
"step": 80
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2035.7501220703125,
|
|
"epoch": 0.046285714285714284,
|
|
"grad_norm": 1.1667371988296509,
|
|
"kl": 0.02191162109375,
|
|
"learning_rate": 9.895025252503755e-07,
|
|
"loss": -1.7981,
|
|
"reward": 0.6500000059604645,
|
|
"reward_std": 0.28908342123031616,
|
|
"rewards/accuracy_reward": 0.2500000074505806,
|
|
"rewards/format_reward": 0.8333333730697632,
|
|
"step": 81
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2879.2083740234375,
|
|
"epoch": 0.046857142857142854,
|
|
"grad_norm": 0.5121353268623352,
|
|
"kl": 0.009796142578125,
|
|
"learning_rate": 9.888172094375033e-07,
|
|
"loss": -0.8266,
|
|
"reward": 0.25,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 82
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1211.4166870117188,
|
|
"epoch": 0.04742857142857143,
|
|
"grad_norm": 0.9632378816604614,
|
|
"kl": 0.01214599609375,
|
|
"learning_rate": 9.881105062929221e-07,
|
|
"loss": -0.5763,
|
|
"reward": 0.7250000536441803,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.4166666716337204,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 83
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1922.916748046875,
|
|
"epoch": 0.048,
|
|
"grad_norm": 1.0117182731628418,
|
|
"kl": 0.009674072265625,
|
|
"learning_rate": 9.873824502603459e-07,
|
|
"loss": -0.7207,
|
|
"reward": 0.6000000536441803,
|
|
"reward_std": 0.27253396064043045,
|
|
"rewards/accuracy_reward": 0.291666679084301,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 84
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2529.875,
|
|
"epoch": 0.04857142857142857,
|
|
"grad_norm": 1.2297358512878418,
|
|
"kl": 0.023101806640625,
|
|
"learning_rate": 9.866330768241983e-07,
|
|
"loss": -1.8915,
|
|
"reward": 0.4000000059604645,
|
|
"reward_std": 0.22963720560073853,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 85
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2450.291748046875,
|
|
"epoch": 0.04914285714285714,
|
|
"grad_norm": 0.9087566137313843,
|
|
"kl": 0.014007568359375,
|
|
"learning_rate": 9.85862422507884e-07,
|
|
"loss": -1.8448,
|
|
"reward": 0.8500000536441803,
|
|
"reward_std": 0.464758038520813,
|
|
"rewards/accuracy_reward": 0.5833333432674408,
|
|
"rewards/format_reward": 0.8333333730697632,
|
|
"step": 86
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2870.5,
|
|
"epoch": 0.04971428571428571,
|
|
"grad_norm": 0.9272903800010681,
|
|
"kl": 0.01702880859375,
|
|
"learning_rate": 9.850705248720068e-07,
|
|
"loss": -0.4444,
|
|
"reward": 0.32500001415610313,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 87
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2377.8333740234375,
|
|
"epoch": 0.05028571428571429,
|
|
"grad_norm": 0.9736530184745789,
|
|
"kl": 0.013763427734375,
|
|
"learning_rate": 9.8425742251254e-07,
|
|
"loss": -1.3592,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.30921074748039246,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.6250000298023224,
|
|
"step": 88
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2418.791748046875,
|
|
"epoch": 0.05085714285714286,
|
|
"grad_norm": 0.9435445666313171,
|
|
"kl": 0.023223876953125,
|
|
"learning_rate": 9.83423155058946e-07,
|
|
"loss": -1.928,
|
|
"reward": 0.3500000238418579,
|
|
"reward_std": 0.26902148127555847,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 89
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2293.8751220703125,
|
|
"epoch": 0.05142857142857143,
|
|
"grad_norm": 1.0847179889678955,
|
|
"kl": 0.0203857421875,
|
|
"learning_rate": 9.825677631722435e-07,
|
|
"loss": -1.8569,
|
|
"reward": 0.8250000774860382,
|
|
"reward_std": 0.42503853142261505,
|
|
"rewards/accuracy_reward": 0.5416666865348816,
|
|
"rewards/format_reward": 0.8333333730697632,
|
|
"step": 90
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3049.3751220703125,
|
|
"epoch": 0.052,
|
|
"grad_norm": 0.5584085583686829,
|
|
"kl": 0.016754150390625,
|
|
"learning_rate": 9.816912885430258e-07,
|
|
"loss": -0.8024,
|
|
"reward": 0.40000002086162567,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 91
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2802.0,
|
|
"epoch": 0.052571428571428575,
|
|
"grad_norm": 0.9023773670196533,
|
|
"kl": 0.0162353515625,
|
|
"learning_rate": 9.807937738894303e-07,
|
|
"loss": 0.0026,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 92
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2122.5,
|
|
"epoch": 0.053142857142857144,
|
|
"grad_norm": 1.0342284440994263,
|
|
"kl": 0.016387939453125,
|
|
"learning_rate": 9.798752629550546e-07,
|
|
"loss": 0.0026,
|
|
"reward": 0.42500004172325134,
|
|
"reward_std": 0.13869690895080566,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 93
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2639.2501220703125,
|
|
"epoch": 0.053714285714285714,
|
|
"grad_norm": 0.929490864276886,
|
|
"kl": 0.014007568359375,
|
|
"learning_rate": 9.78935800506826e-07,
|
|
"loss": -2.0397,
|
|
"reward": 0.5250000059604645,
|
|
"reward_std": 0.2479735016822815,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.7083333730697632,
|
|
"step": 94
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1750.0833740234375,
|
|
"epoch": 0.054285714285714284,
|
|
"grad_norm": 1.3795406818389893,
|
|
"kl": 0.01446533203125,
|
|
"learning_rate": 9.779754323328192e-07,
|
|
"loss": -2.9239,
|
|
"reward": 0.675000011920929,
|
|
"reward_std": 0.39452049136161804,
|
|
"rewards/accuracy_reward": 0.291666679084301,
|
|
"rewards/format_reward": 0.8333333730697632,
|
|
"step": 95
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3076.70849609375,
|
|
"epoch": 0.054857142857142854,
|
|
"grad_norm": 0.8867998123168945,
|
|
"kl": 0.01177978515625,
|
|
"learning_rate": 9.769942052400235e-07,
|
|
"loss": -1.3267,
|
|
"reward": 0.2500000223517418,
|
|
"reward_std": 0.24494898319244385,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.3333333544433117,
|
|
"step": 96
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2068.1250610351562,
|
|
"epoch": 0.05542857142857143,
|
|
"grad_norm": 1.0344054698944092,
|
|
"kl": 0.0184326171875,
|
|
"learning_rate": 9.759921670520634e-07,
|
|
"loss": -0.6738,
|
|
"reward": 0.3749999962747097,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.5416666679084301,
|
|
"step": 97
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1832.9584350585938,
|
|
"epoch": 0.056,
|
|
"grad_norm": 0.7039546966552734,
|
|
"kl": 0.0093536376953125,
|
|
"learning_rate": 9.749693666068663e-07,
|
|
"loss": -1.7297,
|
|
"reward": 0.7250000536441803,
|
|
"reward_std": 0.31179559230804443,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 98
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2053.041748046875,
|
|
"epoch": 0.05657142857142857,
|
|
"grad_norm": 1.1954854726791382,
|
|
"kl": 0.008026123046875,
|
|
"learning_rate": 9.739258537542835e-07,
|
|
"loss": -1.1949,
|
|
"reward": 0.6500000357627869,
|
|
"reward_std": 0.3548534959554672,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.6250000298023224,
|
|
"step": 99
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1900.1250610351562,
|
|
"epoch": 0.05714285714285714,
|
|
"grad_norm": 0.692438542842865,
|
|
"kl": 0.008209228515625,
|
|
"learning_rate": 9.728616793536587e-07,
|
|
"loss": -0.8441,
|
|
"reward": 0.6750000715255737,
|
|
"reward_std": 0.26995329558849335,
|
|
"rewards/accuracy_reward": 0.5000000298023224,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 100
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2909.0833740234375,
|
|
"epoch": 0.05771428571428571,
|
|
"grad_norm": 0.7268219590187073,
|
|
"kl": 0.0091552734375,
|
|
"learning_rate": 9.717768952713511e-07,
|
|
"loss": -1.6243,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.2323790192604065,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 101
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2342.25,
|
|
"epoch": 0.05828571428571429,
|
|
"grad_norm": 1.311353087425232,
|
|
"kl": 0.0206298828125,
|
|
"learning_rate": 9.706715543782064e-07,
|
|
"loss": -2.3993,
|
|
"reward": 0.42500004172325134,
|
|
"reward_std": 0.3704479932785034,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 102
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2729.7916870117188,
|
|
"epoch": 0.05885714285714286,
|
|
"grad_norm": 1.082575798034668,
|
|
"kl": 0.0152587890625,
|
|
"learning_rate": 9.695457105469804e-07,
|
|
"loss": -2.3919,
|
|
"reward": 0.2500000111758709,
|
|
"reward_std": 0.2861757278442383,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 103
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2986.5833740234375,
|
|
"epoch": 0.05942857142857143,
|
|
"grad_norm": 0.5590086579322815,
|
|
"kl": 0.011871337890625,
|
|
"learning_rate": 9.683994186497132e-07,
|
|
"loss": -0.5574,
|
|
"reward": 0.30000003427267075,
|
|
"reward_std": 0.19993416219949722,
|
|
"rewards/accuracy_reward": 0.2083333395421505,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 104
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2627.33349609375,
|
|
"epoch": 0.06,
|
|
"grad_norm": 0.7722799777984619,
|
|
"kl": 0.02337646484375,
|
|
"learning_rate": 9.672327345550543e-07,
|
|
"loss": 0.0037,
|
|
"reward": 0.32500001788139343,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 105
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2862.666748046875,
|
|
"epoch": 0.060571428571428575,
|
|
"grad_norm": 0.46922969818115234,
|
|
"kl": 0.009735107421875,
|
|
"learning_rate": 9.66045715125541e-07,
|
|
"loss": -0.7062,
|
|
"reward": 0.27500003576278687,
|
|
"reward_std": 0.20463287830352783,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.2916666865348816,
|
|
"step": 106
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2396.3750610351562,
|
|
"epoch": 0.061142857142857145,
|
|
"grad_norm": 0.8894286155700684,
|
|
"kl": 0.01702880859375,
|
|
"learning_rate": 9.648384182148252e-07,
|
|
"loss": -1.4031,
|
|
"reward": 0.3499999940395355,
|
|
"reward_std": 0.1741531491279602,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.5000000298023224,
|
|
"step": 107
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2107.0833740234375,
|
|
"epoch": 0.061714285714285715,
|
|
"grad_norm": 0.5590862035751343,
|
|
"kl": 0.0107421875,
|
|
"learning_rate": 9.636109026648554e-07,
|
|
"loss": -0.6467,
|
|
"reward": 0.32500000298023224,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 108
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3016.291748046875,
|
|
"epoch": 0.062285714285714285,
|
|
"grad_norm": 0.4933023750782013,
|
|
"kl": 0.0123291015625,
|
|
"learning_rate": 9.623632283030077e-07,
|
|
"loss": 0.002,
|
|
"reward": 0.17499999701976776,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 109
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2778.0001220703125,
|
|
"epoch": 0.06285714285714286,
|
|
"grad_norm": 0.7055474519729614,
|
|
"kl": 0.024871826171875,
|
|
"learning_rate": 9.610954559391704e-07,
|
|
"loss": -0.7942,
|
|
"reward": 0.2500000223517418,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.3333333358168602,
|
|
"step": 110
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2845.041748046875,
|
|
"epoch": 0.06342857142857143,
|
|
"grad_norm": 2.253298044204712,
|
|
"kl": 0.01910400390625,
|
|
"learning_rate": 9.598076473627796e-07,
|
|
"loss": 0.003,
|
|
"reward": 0.2250000238418579,
|
|
"reward_std": 0.08215838670730591,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 111
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2531.916748046875,
|
|
"epoch": 0.064,
|
|
"grad_norm": 0.9246645569801331,
|
|
"kl": 0.015045166015625,
|
|
"learning_rate": 9.58499865339809e-07,
|
|
"loss": -2.463,
|
|
"reward": 0.3750000298023224,
|
|
"reward_std": 0.28209254145622253,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.5000000298023224,
|
|
"step": 112
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2310.4583740234375,
|
|
"epoch": 0.06457142857142857,
|
|
"grad_norm": 0.46783214807510376,
|
|
"kl": 0.01202392578125,
|
|
"learning_rate": 9.571721736097088e-07,
|
|
"loss": -0.7241,
|
|
"reward": 0.3750000149011612,
|
|
"reward_std": 0.17702671885490417,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 113
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3185.666748046875,
|
|
"epoch": 0.06514285714285714,
|
|
"grad_norm": 0.66249018907547,
|
|
"kl": 0.01556396484375,
|
|
"learning_rate": 9.55824636882301e-07,
|
|
"loss": -0.9383,
|
|
"reward": 0.10000000894069672,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.1666666716337204,
|
|
"step": 114
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2467.2501220703125,
|
|
"epoch": 0.06571428571428571,
|
|
"grad_norm": 0.9964971542358398,
|
|
"kl": 0.0130615234375,
|
|
"learning_rate": 9.54457320834625e-07,
|
|
"loss": -2.1875,
|
|
"reward": 0.32500001788139343,
|
|
"reward_std": 0.2611714005470276,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 115
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2428.5416870117188,
|
|
"epoch": 0.06628571428571428,
|
|
"grad_norm": 0.472149521112442,
|
|
"kl": 0.01055908203125,
|
|
"learning_rate": 9.530702921077358e-07,
|
|
"loss": 0.0017,
|
|
"reward": 0.32500001788139343,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 116
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2962.041748046875,
|
|
"epoch": 0.06685714285714285,
|
|
"grad_norm": 1.044438362121582,
|
|
"kl": 0.0174560546875,
|
|
"learning_rate": 9.516636183034564e-07,
|
|
"loss": -2.2953,
|
|
"reward": 0.5999999940395355,
|
|
"reward_std": 0.44171059131622314,
|
|
"rewards/accuracy_reward": 0.5000000298023224,
|
|
"rewards/format_reward": 0.5000000298023224,
|
|
"step": 117
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1725.3333740234375,
|
|
"epoch": 0.06742857142857143,
|
|
"grad_norm": 0.6704514622688293,
|
|
"kl": 0.013427734375,
|
|
"learning_rate": 9.502373679810839e-07,
|
|
"loss": -0.7163,
|
|
"reward": 0.6250000298023224,
|
|
"reward_std": 0.11291590332984924,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 118
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2067.8333740234375,
|
|
"epoch": 0.068,
|
|
"grad_norm": 0.798595666885376,
|
|
"kl": 0.014404296875,
|
|
"learning_rate": 9.487916106540465e-07,
|
|
"loss": 0.0023,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.5,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 119
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2834.75,
|
|
"epoch": 0.06857142857142857,
|
|
"grad_norm": 0.5755884051322937,
|
|
"kl": 0.01953125,
|
|
"learning_rate": 9.473264167865171e-07,
|
|
"loss": -0.8146,
|
|
"reward": 0.25,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 120
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2840.291748046875,
|
|
"epoch": 0.06914285714285714,
|
|
"grad_norm": 0.5671223402023315,
|
|
"kl": 0.018829345703125,
|
|
"learning_rate": 9.458418577899774e-07,
|
|
"loss": -0.7081,
|
|
"reward": 0.2750000022351742,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 121
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1911.1251220703125,
|
|
"epoch": 0.06971428571428571,
|
|
"grad_norm": 0.6135214567184448,
|
|
"kl": 0.014678955078125,
|
|
"learning_rate": 9.443380060197385e-07,
|
|
"loss": 0.0024,
|
|
"reward": 0.7750000357627869,
|
|
"reward_std": 0.13869690895080566,
|
|
"rewards/accuracy_reward": 0.5416666865348816,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 122
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2615.7083740234375,
|
|
"epoch": 0.07028571428571428,
|
|
"grad_norm": 1.535902976989746,
|
|
"kl": 0.019317626953125,
|
|
"learning_rate": 9.428149347714143e-07,
|
|
"loss": -3.4492,
|
|
"reward": 0.5000000447034836,
|
|
"reward_std": 0.49277445673942566,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.5000000298023224,
|
|
"step": 123
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2879.6251220703125,
|
|
"epoch": 0.07085714285714285,
|
|
"grad_norm": 1.1739275455474854,
|
|
"kl": 0.016326904296875,
|
|
"learning_rate": 9.412727182773486e-07,
|
|
"loss": -2.466,
|
|
"reward": 0.5,
|
|
"reward_std": 0.22085529565811157,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 124
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1907.7916870117188,
|
|
"epoch": 0.07142857142857142,
|
|
"grad_norm": 0.5758523344993591,
|
|
"kl": 0.016204833984375,
|
|
"learning_rate": 9.397114317029974e-07,
|
|
"loss": -0.8541,
|
|
"reward": 0.5750000476837158,
|
|
"reward_std": 0.19540132582187653,
|
|
"rewards/accuracy_reward": 0.291666679084301,
|
|
"rewards/format_reward": 0.6666666716337204,
|
|
"step": 125
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1292.2916870117188,
|
|
"epoch": 0.072,
|
|
"grad_norm": 0.6998382806777954,
|
|
"kl": 0.009063720703125,
|
|
"learning_rate": 9.381311511432658e-07,
|
|
"loss": -0.7275,
|
|
"reward": 0.7500000298023224,
|
|
"reward_std": 0.16431677341461182,
|
|
"rewards/accuracy_reward": 0.375,
|
|
"rewards/format_reward": 0.875,
|
|
"step": 126
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2668.916748046875,
|
|
"epoch": 0.07257142857142856,
|
|
"grad_norm": 0.6192981004714966,
|
|
"kl": 0.0121612548828125,
|
|
"learning_rate": 9.36531953618799e-07,
|
|
"loss": -0.9637,
|
|
"reward": 0.3750000149011612,
|
|
"reward_std": 0.08215838670730591,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 127
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2330.33349609375,
|
|
"epoch": 0.07314285714285715,
|
|
"grad_norm": 0.850170373916626,
|
|
"kl": 0.0172271728515625,
|
|
"learning_rate": 9.34913917072228e-07,
|
|
"loss": -1.662,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.1741531491279602,
|
|
"rewards/accuracy_reward": 0.2916666865348816,
|
|
"rewards/format_reward": 0.6250000298023224,
|
|
"step": 128
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1855.9166870117188,
|
|
"epoch": 0.07371428571428572,
|
|
"grad_norm": 0.6127024292945862,
|
|
"kl": 0.013641357421875,
|
|
"learning_rate": 9.332771203643714e-07,
|
|
"loss": -0.7422,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 129
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1596.4583740234375,
|
|
"epoch": 0.07428571428571429,
|
|
"grad_norm": 0.6304543614387512,
|
|
"kl": 0.01348876953125,
|
|
"learning_rate": 9.316216432703916e-07,
|
|
"loss": 0.0022,
|
|
"reward": 0.6500000655651093,
|
|
"reward_std": 0.24177644401788712,
|
|
"rewards/accuracy_reward": 0.3333333358168602,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 130
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2883.291748046875,
|
|
"epoch": 0.07485714285714286,
|
|
"grad_norm": 0.4680456817150116,
|
|
"kl": 0.01495361328125,
|
|
"learning_rate": 9.299475664759068e-07,
|
|
"loss": 0.0024,
|
|
"reward": 0.15000000596046448,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 131
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2198.375,
|
|
"epoch": 0.07542857142857143,
|
|
"grad_norm": 0.583177924156189,
|
|
"kl": 0.013580322265625,
|
|
"learning_rate": 9.282549715730579e-07,
|
|
"loss": -1.4613,
|
|
"reward": 0.5750000476837158,
|
|
"reward_std": 0.15610557794570923,
|
|
"rewards/accuracy_reward": 0.2916666679084301,
|
|
"rewards/format_reward": 0.6666666865348816,
|
|
"step": 132
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2586.6666870117188,
|
|
"epoch": 0.076,
|
|
"grad_norm": 0.4416232705116272,
|
|
"kl": 0.0152587890625,
|
|
"learning_rate": 9.265439410565328e-07,
|
|
"loss": -0.7425,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.20463287830352783,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 133
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2783.4583740234375,
|
|
"epoch": 0.07657142857142857,
|
|
"grad_norm": 0.8242610096931458,
|
|
"kl": 0.01690673828125,
|
|
"learning_rate": 9.248145583195447e-07,
|
|
"loss": -1.5028,
|
|
"reward": 0.3500000163912773,
|
|
"reward_std": 0.24494898319244385,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.3750000223517418,
|
|
"step": 134
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2785.666748046875,
|
|
"epoch": 0.07714285714285714,
|
|
"grad_norm": 0.8124563097953796,
|
|
"kl": 0.02239990234375,
|
|
"learning_rate": 9.230669076497687e-07,
|
|
"loss": -0.6555,
|
|
"reward": 0.19999999925494194,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 135
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1172.4583740234375,
|
|
"epoch": 0.07771428571428571,
|
|
"grad_norm": 1.0723152160644531,
|
|
"kl": 0.018798828125,
|
|
"learning_rate": 9.213010742252327e-07,
|
|
"loss": -0.9192,
|
|
"reward": 0.675000011920929,
|
|
"reward_std": 0.2611714005470276,
|
|
"rewards/accuracy_reward": 0.2500000111758709,
|
|
"rewards/format_reward": 0.875,
|
|
"step": 136
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2302.041748046875,
|
|
"epoch": 0.07828571428571429,
|
|
"grad_norm": 0.9255122542381287,
|
|
"kl": 0.0070953369140625,
|
|
"learning_rate": 9.195171441101668e-07,
|
|
"loss": -2.5899,
|
|
"reward": 0.42500001192092896,
|
|
"reward_std": 0.4010545611381531,
|
|
"rewards/accuracy_reward": 0.2083333395421505,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 137
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2176.9583740234375,
|
|
"epoch": 0.07885714285714286,
|
|
"grad_norm": 0.5417965650558472,
|
|
"kl": 0.016082763671875,
|
|
"learning_rate": 9.177152042508077e-07,
|
|
"loss": 0.0026,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 138
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2479.8333740234375,
|
|
"epoch": 0.07942857142857143,
|
|
"grad_norm": 0.8476680517196655,
|
|
"kl": 0.019317626953125,
|
|
"learning_rate": 9.158953424711624e-07,
|
|
"loss": -1.9513,
|
|
"reward": 0.30000003799796104,
|
|
"reward_std": 0.2173428237438202,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.4583333544433117,
|
|
"step": 139
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1950.8334350585938,
|
|
"epoch": 0.08,
|
|
"grad_norm": 0.32817330956459045,
|
|
"kl": 0.0100860595703125,
|
|
"learning_rate": 9.140576474687263e-07,
|
|
"loss": -0.7432,
|
|
"reward": 0.5750000178813934,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 140
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1543.7500610351562,
|
|
"epoch": 0.08057142857142857,
|
|
"grad_norm": 0.5753952264785767,
|
|
"kl": 0.01080322265625,
|
|
"learning_rate": 9.122022088101613e-07,
|
|
"loss": 0.0017,
|
|
"reward": 0.4750000089406967,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 141
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2146.2083740234375,
|
|
"epoch": 0.08114285714285714,
|
|
"grad_norm": 0.7607588171958923,
|
|
"kl": 0.01971435546875,
|
|
"learning_rate": 9.103291169269299e-07,
|
|
"loss": -1.4222,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.280418336391449,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 142
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2653.3751220703125,
|
|
"epoch": 0.08171428571428571,
|
|
"grad_norm": 1.6966173648834229,
|
|
"kl": 0.0238037109375,
|
|
"learning_rate": 9.084384631108882e-07,
|
|
"loss": -1.9226,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.2773938328027725,
|
|
"rewards/accuracy_reward": 0.3333333358168602,
|
|
"rewards/format_reward": 0.4166666865348816,
|
|
"step": 143
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1880.166748046875,
|
|
"epoch": 0.08228571428571428,
|
|
"grad_norm": 1.1052340269088745,
|
|
"kl": 0.018768310546875,
|
|
"learning_rate": 9.065303395098358e-07,
|
|
"loss": -0.9172,
|
|
"reward": 0.7750000357627869,
|
|
"reward_std": 0.22493848204612732,
|
|
"rewards/accuracy_reward": 0.4166666716337204,
|
|
"rewards/format_reward": 0.875,
|
|
"step": 144
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 886.25,
|
|
"epoch": 0.08285714285714285,
|
|
"grad_norm": 0.5123775601387024,
|
|
"kl": 0.016632080078125,
|
|
"learning_rate": 9.046048391230247e-07,
|
|
"loss": 0.0027,
|
|
"reward": 0.6500000059604645,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 1.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1539.8333435058594,
|
|
"epoch": 0.08342857142857144,
|
|
"grad_norm": 0.5727400779724121,
|
|
"kl": 0.01708984375,
|
|
"learning_rate": 9.026620557966279e-07,
|
|
"loss": 0.0027,
|
|
"reward": 0.6500000357627869,
|
|
"reward_std": 0.22085530310869217,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 146
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1381.3750610351562,
|
|
"epoch": 0.084,
|
|
"grad_norm": 0.8086219429969788,
|
|
"kl": 0.01629638671875,
|
|
"learning_rate": 9.007020842191634e-07,
|
|
"loss": -1.4881,
|
|
"reward": 0.7750000357627869,
|
|
"reward_std": 0.31787581741809845,
|
|
"rewards/accuracy_reward": 0.4166666865348816,
|
|
"rewards/format_reward": 0.8750000298023224,
|
|
"step": 147
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1451.5,
|
|
"epoch": 0.08457142857142858,
|
|
"grad_norm": 0.8584139347076416,
|
|
"kl": 0.01898193359375,
|
|
"learning_rate": 8.987250199168808e-07,
|
|
"loss": -0.9353,
|
|
"reward": 0.550000011920929,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.8333333432674408,
|
|
"step": 148
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1420.4583740234375,
|
|
"epoch": 0.08514285714285715,
|
|
"grad_norm": 0.8129308223724365,
|
|
"kl": 0.014434814453125,
|
|
"learning_rate": 8.967309592491052e-07,
|
|
"loss": -1.3771,
|
|
"reward": 0.6000000536441803,
|
|
"reward_std": 0.19993415474891663,
|
|
"rewards/accuracy_reward": 0.2500000111758709,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 149
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2162.666748046875,
|
|
"epoch": 0.08571428571428572,
|
|
"grad_norm": 0.889790952205658,
|
|
"kl": 0.02252197265625,
|
|
"learning_rate": 8.9471999940354e-07,
|
|
"loss": -0.58,
|
|
"reward": 0.42500004172325134,
|
|
"reward_std": 0.20463287830352783,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 150
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2218.7916870117188,
|
|
"epoch": 0.08628571428571429,
|
|
"grad_norm": 0.5901851654052734,
|
|
"kl": 0.013671875,
|
|
"learning_rate": 8.926922383915315e-07,
|
|
"loss": -0.7892,
|
|
"reward": 0.5500000268220901,
|
|
"reward_std": 0.22085529565811157,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 151
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2912.916748046875,
|
|
"epoch": 0.08685714285714285,
|
|
"grad_norm": 0.5517586469650269,
|
|
"kl": 0.021484375,
|
|
"learning_rate": 8.906477750432903e-07,
|
|
"loss": -0.891,
|
|
"reward": 0.3250000327825546,
|
|
"reward_std": 0.1596180573105812,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 152
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2069.916748046875,
|
|
"epoch": 0.08742857142857142,
|
|
"grad_norm": 0.6771623492240906,
|
|
"kl": 0.015380859375,
|
|
"learning_rate": 8.88586709003076e-07,
|
|
"loss": -1.2495,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.15610557794570923,
|
|
"rewards/accuracy_reward": 0.2916666679084301,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 153
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2867.5833740234375,
|
|
"epoch": 0.088,
|
|
"grad_norm": 0.6926563382148743,
|
|
"kl": 0.017333984375,
|
|
"learning_rate": 8.865091407243394e-07,
|
|
"loss": -1.5406,
|
|
"reward": 0.27500002086162567,
|
|
"reward_std": 0.2479735016822815,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.291666679084301,
|
|
"step": 154
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3541.0833740234375,
|
|
"epoch": 0.08857142857142856,
|
|
"grad_norm": 0.6008081436157227,
|
|
"kl": 0.01416015625,
|
|
"learning_rate": 8.844151714648274e-07,
|
|
"loss": -0.6494,
|
|
"reward": 0.07500000298023224,
|
|
"reward_std": 0.12549901008605957,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.0416666679084301,
|
|
"step": 155
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1990.041748046875,
|
|
"epoch": 0.08914285714285715,
|
|
"grad_norm": 1.1144965887069702,
|
|
"kl": 0.01812744140625,
|
|
"learning_rate": 8.823049032816478e-07,
|
|
"loss": -2.2955,
|
|
"reward": 0.675000011920929,
|
|
"reward_std": 0.3254331648349762,
|
|
"rewards/accuracy_reward": 0.3750000149011612,
|
|
"rewards/format_reward": 0.7500000298023224,
|
|
"step": 156
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2252.8333740234375,
|
|
"epoch": 0.08971428571428572,
|
|
"grad_norm": 0.5644240379333496,
|
|
"kl": 0.012451171875,
|
|
"learning_rate": 8.801784390262943e-07,
|
|
"loss": 0.002,
|
|
"reward": 0.3750000298023224,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 157
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1755.791748046875,
|
|
"epoch": 0.09028571428571429,
|
|
"grad_norm": 0.8909981846809387,
|
|
"kl": 0.0125732421875,
|
|
"learning_rate": 8.780358823396352e-07,
|
|
"loss": -1.3986,
|
|
"reward": 0.7000000476837158,
|
|
"reward_std": 0.38667041063308716,
|
|
"rewards/accuracy_reward": 0.3750000223517418,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 158
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3373.541748046875,
|
|
"epoch": 0.09085714285714286,
|
|
"grad_norm": 0.568712055683136,
|
|
"kl": 0.016937255859375,
|
|
"learning_rate": 8.758773376468604e-07,
|
|
"loss": -0.678,
|
|
"reward": 0.05000000447034836,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.0833333358168602,
|
|
"step": 159
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3461.0833740234375,
|
|
"epoch": 0.09142857142857143,
|
|
"grad_norm": 0.5345960855484009,
|
|
"kl": 0.01690673828125,
|
|
"learning_rate": 8.737029101523929e-07,
|
|
"loss": -1.3097,
|
|
"reward": 0.07500000298023224,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.0833333358168602,
|
|
"step": 160
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2223.291748046875,
|
|
"epoch": 0.092,
|
|
"grad_norm": 0.8153612017631531,
|
|
"kl": 0.0274658203125,
|
|
"learning_rate": 8.715127058347614e-07,
|
|
"loss": -0.9097,
|
|
"reward": 0.6000000089406967,
|
|
"reward_std": 0.32863354682922363,
|
|
"rewards/accuracy_reward": 0.375,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 161
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2307.875,
|
|
"epoch": 0.09257142857142857,
|
|
"grad_norm": 0.5529870986938477,
|
|
"kl": 0.013397216796875,
|
|
"learning_rate": 8.693068314414344e-07,
|
|
"loss": -1.2837,
|
|
"reward": 0.5000000447034836,
|
|
"reward_std": 0.2773938328027725,
|
|
"rewards/accuracy_reward": 0.3750000149011612,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 162
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2981.0,
|
|
"epoch": 0.09314285714285714,
|
|
"grad_norm": 0.8973040580749512,
|
|
"kl": 0.0225830078125,
|
|
"learning_rate": 8.670853944836176e-07,
|
|
"loss": -1.6263,
|
|
"reward": 0.20000001788139343,
|
|
"reward_std": 0.21162375807762146,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 163
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2735.875,
|
|
"epoch": 0.09371428571428571,
|
|
"grad_norm": 1.581978678703308,
|
|
"kl": 0.016082763671875,
|
|
"learning_rate": 8.648485032310144e-07,
|
|
"loss": -2.4628,
|
|
"reward": 0.42500001192092896,
|
|
"reward_std": 0.35955221951007843,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 164
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3489.4583740234375,
|
|
"epoch": 0.09428571428571429,
|
|
"grad_norm": 1.0414494276046753,
|
|
"kl": 0.0177001953125,
|
|
"learning_rate": 8.625962667065487e-07,
|
|
"loss": -2.4063,
|
|
"reward": 0.15000000223517418,
|
|
"reward_std": 0.26419591903686523,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.2083333395421505,
|
|
"step": 165
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3584.0,
|
|
"epoch": 0.09485714285714286,
|
|
"grad_norm": 0.3711743652820587,
|
|
"kl": 0.015045166015625,
|
|
"learning_rate": 8.603287946810513e-07,
|
|
"loss": -0.6281,
|
|
"reward": 0.05000000447034836,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.0416666679084301,
|
|
"step": 166
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1534.9584350585938,
|
|
"epoch": 0.09542857142857143,
|
|
"grad_norm": 0.7632970809936523,
|
|
"kl": 0.02166748046875,
|
|
"learning_rate": 8.580461976679099e-07,
|
|
"loss": -0.8458,
|
|
"reward": 0.8250000774860382,
|
|
"reward_std": 0.29361626505851746,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.9166666865348816,
|
|
"step": 167
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2715.8333740234375,
|
|
"epoch": 0.096,
|
|
"grad_norm": 0.774446427822113,
|
|
"kl": 0.02288818359375,
|
|
"learning_rate": 8.557485869176825e-07,
|
|
"loss": -1.9958,
|
|
"reward": 0.40000003576278687,
|
|
"reward_std": 0.29662763327360153,
|
|
"rewards/accuracy_reward": 0.2083333358168602,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 168
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1717.4166870117188,
|
|
"epoch": 0.09657142857142857,
|
|
"grad_norm": 0.6996132135391235,
|
|
"kl": 0.01715087890625,
|
|
"learning_rate": 8.534360744126753e-07,
|
|
"loss": -1.6658,
|
|
"reward": 0.5500000268220901,
|
|
"reward_std": 0.29662764072418213,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 169
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3413.9583740234375,
|
|
"epoch": 0.09714285714285714,
|
|
"grad_norm": 0.6789911389350891,
|
|
"kl": 0.02880859375,
|
|
"learning_rate": 8.511087728614862e-07,
|
|
"loss": -1.5654,
|
|
"reward": 0.1250000111758709,
|
|
"reward_std": 0.2611714005470276,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.1666666679084301,
|
|
"step": 170
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2726.70849609375,
|
|
"epoch": 0.09771428571428571,
|
|
"grad_norm": 0.3861570954322815,
|
|
"kl": 0.01446533203125,
|
|
"learning_rate": 8.487667956935087e-07,
|
|
"loss": 0.0023,
|
|
"reward": 0.4000000059604645,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 171
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2290.6251220703125,
|
|
"epoch": 0.09828571428571428,
|
|
"grad_norm": 0.9113690257072449,
|
|
"kl": 0.0257568359375,
|
|
"learning_rate": 8.464102570534061e-07,
|
|
"loss": -0.7408,
|
|
"reward": 0.2750000059604645,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 172
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1820.666748046875,
|
|
"epoch": 0.09885714285714285,
|
|
"grad_norm": 0.9667136073112488,
|
|
"kl": 0.02313232421875,
|
|
"learning_rate": 8.440392717955475e-07,
|
|
"loss": -1.6142,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.26889464259147644,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 173
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3118.0,
|
|
"epoch": 0.09942857142857142,
|
|
"grad_norm": 0.8733185529708862,
|
|
"kl": 0.0316162109375,
|
|
"learning_rate": 8.416539554784089e-07,
|
|
"loss": -1.6321,
|
|
"reward": 0.1250000111758709,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.2083333395421505,
|
|
"step": 174
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1706.4583435058594,
|
|
"epoch": 0.1,
|
|
"grad_norm": 0.45300906896591187,
|
|
"kl": 0.012939453125,
|
|
"learning_rate": 8.392544243589427e-07,
|
|
"loss": 0.0021,
|
|
"reward": 0.8000000566244125,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.6250000149011612,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 175
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1727.5417175292969,
|
|
"epoch": 0.10057142857142858,
|
|
"grad_norm": 0.659142255783081,
|
|
"kl": 0.021392822265625,
|
|
"learning_rate": 8.368407953869103e-07,
|
|
"loss": 0.0034,
|
|
"reward": 0.6000000536441803,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.2916666679084301,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 176
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2159.25,
|
|
"epoch": 0.10114285714285715,
|
|
"grad_norm": 0.8017681837081909,
|
|
"kl": 0.013885498046875,
|
|
"learning_rate": 8.344131861991828e-07,
|
|
"loss": 0.0022,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.13869690895080566,
|
|
"rewards/accuracy_reward": 0.2916666865348816,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 177
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3375.9583740234375,
|
|
"epoch": 0.10171428571428572,
|
|
"grad_norm": 1.2313588857650757,
|
|
"kl": 0.015167236328125,
|
|
"learning_rate": 8.319717151140072e-07,
|
|
"loss": -2.2319,
|
|
"reward": 0.17500000447034836,
|
|
"reward_std": 0.3254331722855568,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.2083333358168602,
|
|
"step": 178
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3116.625,
|
|
"epoch": 0.10228571428571429,
|
|
"grad_norm": 0.6798207759857178,
|
|
"kl": 0.02423095703125,
|
|
"learning_rate": 8.295165011252396e-07,
|
|
"loss": -1.4557,
|
|
"reward": 0.2750000059604645,
|
|
"reward_std": 0.26783522963523865,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.2916666865348816,
|
|
"step": 179
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2302.25,
|
|
"epoch": 0.10285714285714286,
|
|
"grad_norm": 0.5794097781181335,
|
|
"kl": 0.022064208984375,
|
|
"learning_rate": 8.270476638965461e-07,
|
|
"loss": -0.532,
|
|
"reward": 0.5000000298023224,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.3333333358168602,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 180
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2662.5833740234375,
|
|
"epoch": 0.10342857142857143,
|
|
"grad_norm": 0.6577425599098206,
|
|
"kl": 0.019775390625,
|
|
"learning_rate": 8.245653237555705e-07,
|
|
"loss": -1.5816,
|
|
"reward": 0.4500000402331352,
|
|
"reward_std": 0.19993416219949722,
|
|
"rewards/accuracy_reward": 0.2916666679084301,
|
|
"rewards/format_reward": 0.4583333544433117,
|
|
"step": 181
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2937.2501220703125,
|
|
"epoch": 0.104,
|
|
"grad_norm": 0.8629750609397888,
|
|
"kl": 0.02593994140625,
|
|
"learning_rate": 8.220696016880687e-07,
|
|
"loss": -1.5823,
|
|
"reward": 0.15000001341104507,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.2500000074505806,
|
|
"step": 182
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2815.2916870117188,
|
|
"epoch": 0.10457142857142857,
|
|
"grad_norm": 0.6297455430030823,
|
|
"kl": 0.02032470703125,
|
|
"learning_rate": 8.195606193320136e-07,
|
|
"loss": -0.8378,
|
|
"reward": 0.27500003576278687,
|
|
"reward_std": 0.11291590332984924,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.4166666865348816,
|
|
"step": 183
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1781.041748046875,
|
|
"epoch": 0.10514285714285715,
|
|
"grad_norm": 0.5706174969673157,
|
|
"kl": 0.012176513671875,
|
|
"learning_rate": 8.170384989716657e-07,
|
|
"loss": -0.743,
|
|
"reward": 0.7000000476837158,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 184
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1956.2500610351562,
|
|
"epoch": 0.10571428571428572,
|
|
"grad_norm": 0.8781810402870178,
|
|
"kl": 0.0162353515625,
|
|
"learning_rate": 8.145033635316128e-07,
|
|
"loss": -1.4382,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.18973666429519653,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.6666666865348816,
|
|
"step": 185
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2789.2501220703125,
|
|
"epoch": 0.10628571428571429,
|
|
"grad_norm": 0.7274031639099121,
|
|
"kl": 0.01470947265625,
|
|
"learning_rate": 8.119553365707802e-07,
|
|
"loss": -1.4622,
|
|
"reward": 0.4500000476837158,
|
|
"reward_std": 0.36425093561410904,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 186
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3284.791748046875,
|
|
"epoch": 0.10685714285714286,
|
|
"grad_norm": 0.7924415469169617,
|
|
"kl": 0.03204345703125,
|
|
"learning_rate": 8.093945422764069e-07,
|
|
"loss": -1.1243,
|
|
"reward": 0.07500000298023224,
|
|
"reward_std": 0.13869690895080566,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.125,
|
|
"step": 187
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2522.9583740234375,
|
|
"epoch": 0.10742857142857143,
|
|
"grad_norm": 0.8641192317008972,
|
|
"kl": 0.02154541015625,
|
|
"learning_rate": 8.068211054579943e-07,
|
|
"loss": -1.8636,
|
|
"reward": 0.6000000536441803,
|
|
"reward_std": 0.3966957926750183,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 188
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3382.625,
|
|
"epoch": 0.108,
|
|
"grad_norm": 1.1794185638427734,
|
|
"kl": 0.0162353515625,
|
|
"learning_rate": 8.04235151541222e-07,
|
|
"loss": -2.2379,
|
|
"reward": 0.20000001043081284,
|
|
"reward_std": 0.32240865379571915,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.2500000074505806,
|
|
"step": 189
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2990.625,
|
|
"epoch": 0.10857142857142857,
|
|
"grad_norm": 0.6209798455238342,
|
|
"kl": 0.0179443359375,
|
|
"learning_rate": 8.01636806561836e-07,
|
|
"loss": -0.7416,
|
|
"reward": 0.125,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.2083333432674408,
|
|
"step": 190
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2806.8333740234375,
|
|
"epoch": 0.10914285714285714,
|
|
"grad_norm": 1.071854591369629,
|
|
"kl": 0.03564453125,
|
|
"learning_rate": 7.990261971595048e-07,
|
|
"loss": -1.7873,
|
|
"reward": 0.25,
|
|
"reward_std": 0.21162375807762146,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 191
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2162.95849609375,
|
|
"epoch": 0.10971428571428571,
|
|
"grad_norm": 0.6578007340431213,
|
|
"kl": 0.02581787109375,
|
|
"learning_rate": 7.964034505716476e-07,
|
|
"loss": -0.6846,
|
|
"reward": 0.550000011920929,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.375,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 192
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3155.791748046875,
|
|
"epoch": 0.11028571428571429,
|
|
"grad_norm": 0.5719695687294006,
|
|
"kl": 0.0224609375,
|
|
"learning_rate": 7.93768694627233e-07,
|
|
"loss": -1.403,
|
|
"reward": 0.10000000521540642,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.1666666679084301,
|
|
"step": 193
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1952.041748046875,
|
|
"epoch": 0.11085714285714286,
|
|
"grad_norm": 1.1899938583374023,
|
|
"kl": 0.037109375,
|
|
"learning_rate": 7.911220577405484e-07,
|
|
"loss": -1.389,
|
|
"reward": 0.6500000357627869,
|
|
"reward_std": 0.37408730387687683,
|
|
"rewards/accuracy_reward": 0.375,
|
|
"rewards/format_reward": 0.7083333730697632,
|
|
"step": 194
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2549.4583740234375,
|
|
"epoch": 0.11142857142857143,
|
|
"grad_norm": 0.7450345754623413,
|
|
"kl": 0.0228271484375,
|
|
"learning_rate": 7.884636689049422e-07,
|
|
"loss": -1.3458,
|
|
"reward": 0.3999999985098839,
|
|
"reward_std": 0.2658701241016388,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.5000000111758709,
|
|
"step": 195
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2487.7916870117188,
|
|
"epoch": 0.112,
|
|
"grad_norm": 0.5231258869171143,
|
|
"kl": 0.01495361328125,
|
|
"learning_rate": 7.857936576865356e-07,
|
|
"loss": -0.9285,
|
|
"reward": 0.375,
|
|
"reward_std": 0.08215838670730591,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 196
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2426.3334350585938,
|
|
"epoch": 0.11257142857142857,
|
|
"grad_norm": 0.8786201477050781,
|
|
"kl": 0.0322265625,
|
|
"learning_rate": 7.831121542179086e-07,
|
|
"loss": -1.3101,
|
|
"reward": 0.42500003799796104,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.4583333544433117,
|
|
"step": 197
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1954.1250610351562,
|
|
"epoch": 0.11314285714285714,
|
|
"grad_norm": 0.8747029900550842,
|
|
"kl": 0.02593994140625,
|
|
"learning_rate": 7.804192891917571e-07,
|
|
"loss": -1.4407,
|
|
"reward": 0.6000000536441803,
|
|
"reward_std": 0.3504374995827675,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.6666666716337204,
|
|
"step": 198
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2655.416748046875,
|
|
"epoch": 0.11371428571428571,
|
|
"grad_norm": 0.8738974928855896,
|
|
"kl": 0.02545166015625,
|
|
"learning_rate": 7.777151938545235e-07,
|
|
"loss": -1.3506,
|
|
"reward": 0.20000000298023224,
|
|
"reward_std": 0.18673625588417053,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.291666679084301,
|
|
"step": 199
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2649.4583740234375,
|
|
"epoch": 0.11428571428571428,
|
|
"grad_norm": 0.7848045229911804,
|
|
"kl": 0.026123046875,
|
|
"learning_rate": 7.75e-07,
|
|
"loss": -1.6035,
|
|
"reward": 0.32500001788139343,
|
|
"reward_std": 0.26995331048965454,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 200
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2324.0833740234375,
|
|
"epoch": 0.11485714285714285,
|
|
"grad_norm": 0.6355379819869995,
|
|
"kl": 0.017059326171875,
|
|
"learning_rate": 7.72273839962904e-07,
|
|
"loss": -0.7416,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 201
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2468.916748046875,
|
|
"epoch": 0.11542857142857142,
|
|
"grad_norm": 0.7434844374656677,
|
|
"kl": 0.017822265625,
|
|
"learning_rate": 7.695368466124296e-07,
|
|
"loss": 0.0029,
|
|
"reward": 0.40000003576278687,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 202
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2735.2916870117188,
|
|
"epoch": 0.116,
|
|
"grad_norm": 0.4409310221672058,
|
|
"kl": 0.01507568359375,
|
|
"learning_rate": 7.667891533457718e-07,
|
|
"loss": -1.4004,
|
|
"reward": 0.30000000447034836,
|
|
"reward_std": 0.2658701241016388,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.4166666679084301,
|
|
"step": 203
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2205.7501220703125,
|
|
"epoch": 0.11657142857142858,
|
|
"grad_norm": 0.5194834470748901,
|
|
"kl": 0.012176513671875,
|
|
"learning_rate": 7.640308940816239e-07,
|
|
"loss": 0.002,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 204
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2431.0,
|
|
"epoch": 0.11714285714285715,
|
|
"grad_norm": 0.5818977355957031,
|
|
"kl": 0.018218994140625,
|
|
"learning_rate": 7.612622032536507e-07,
|
|
"loss": -0.722,
|
|
"reward": 0.375,
|
|
"reward_std": 0.17702671885490417,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 205
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2947.8751220703125,
|
|
"epoch": 0.11771428571428572,
|
|
"grad_norm": 1.0808711051940918,
|
|
"kl": 0.02557373046875,
|
|
"learning_rate": 7.584832158039378e-07,
|
|
"loss": -2.5238,
|
|
"reward": 0.20000001788139343,
|
|
"reward_std": 0.2323790118098259,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 206
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2864.2501220703125,
|
|
"epoch": 0.11828571428571429,
|
|
"grad_norm": 0.9076454639434814,
|
|
"kl": 0.0179443359375,
|
|
"learning_rate": 7.556940671764124e-07,
|
|
"loss": -1.2846,
|
|
"reward": 0.27500002086162567,
|
|
"reward_std": 0.2611714079976082,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.3333333358168602,
|
|
"step": 207
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2238.541748046875,
|
|
"epoch": 0.11885714285714286,
|
|
"grad_norm": 0.5197967886924744,
|
|
"kl": 0.0123291015625,
|
|
"learning_rate": 7.528948933102438e-07,
|
|
"loss": -0.7262,
|
|
"reward": 0.6250000298023224,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.5,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 208
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2626.291748046875,
|
|
"epoch": 0.11942857142857143,
|
|
"grad_norm": 0.5860099196434021,
|
|
"kl": 0.012420654296875,
|
|
"learning_rate": 7.500858306332172e-07,
|
|
"loss": -1.2149,
|
|
"reward": 0.3750000149011612,
|
|
"reward_std": 0.13869690895080566,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 209
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2280.5000610351562,
|
|
"epoch": 0.12,
|
|
"grad_norm": 1.0006970167160034,
|
|
"kl": 0.02508544921875,
|
|
"learning_rate": 7.472670160550848e-07,
|
|
"loss": -1.5695,
|
|
"reward": 0.4000000059604645,
|
|
"reward_std": 0.17232800275087357,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.6250000298023224,
|
|
"step": 210
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1863.8750610351562,
|
|
"epoch": 0.12057142857142857,
|
|
"grad_norm": 0.7675609588623047,
|
|
"kl": 0.013702392578125,
|
|
"learning_rate": 7.444385869608921e-07,
|
|
"loss": -0.7409,
|
|
"reward": 0.6500000357627869,
|
|
"reward_std": 0.2658701241016388,
|
|
"rewards/accuracy_reward": 0.3750000149011612,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 211
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3500.916748046875,
|
|
"epoch": 0.12114285714285715,
|
|
"grad_norm": 1.411064863204956,
|
|
"kl": 0.02862548828125,
|
|
"learning_rate": 7.416006812042827e-07,
|
|
"loss": -2.3867,
|
|
"reward": 0.22500000149011612,
|
|
"reward_std": 0.3480285108089447,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.2083333395421505,
|
|
"step": 212
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1511.2500610351562,
|
|
"epoch": 0.12171428571428572,
|
|
"grad_norm": 0.5711240768432617,
|
|
"kl": 0.01910400390625,
|
|
"learning_rate": 7.387534371007797e-07,
|
|
"loss": 0.0031,
|
|
"reward": 0.5250000059604645,
|
|
"reward_std": 0.13869690895080566,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 213
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2068.9584350585938,
|
|
"epoch": 0.12228571428571429,
|
|
"grad_norm": 0.3741142153739929,
|
|
"kl": 0.0205078125,
|
|
"learning_rate": 7.358969934210438e-07,
|
|
"loss": -0.5851,
|
|
"reward": 0.32500000298023224,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 214
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2977.791748046875,
|
|
"epoch": 0.12285714285714286,
|
|
"grad_norm": 1.0709601640701294,
|
|
"kl": 0.02203369140625,
|
|
"learning_rate": 7.330314893841101e-07,
|
|
"loss": -2.8587,
|
|
"reward": 0.30000002682209015,
|
|
"reward_std": 0.32240864634513855,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 215
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 885.9583435058594,
|
|
"epoch": 0.12342857142857143,
|
|
"grad_norm": 0.8305730223655701,
|
|
"kl": 0.015655517578125,
|
|
"learning_rate": 7.301570646506027e-07,
|
|
"loss": -0.8887,
|
|
"reward": 1.0250000655651093,
|
|
"reward_std": 0.11291590332984924,
|
|
"rewards/accuracy_reward": 0.7916666865348816,
|
|
"rewards/format_reward": 0.9166666865348816,
|
|
"step": 216
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3584.0,
|
|
"epoch": 0.124,
|
|
"grad_norm": 0.4252244532108307,
|
|
"kl": 0.0159912109375,
|
|
"learning_rate": 7.27273859315928e-07,
|
|
"loss": 0.0026,
|
|
"reward": 0.0,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.0,
|
|
"step": 217
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2363.666748046875,
|
|
"epoch": 0.12457142857142857,
|
|
"grad_norm": 0.8345184922218323,
|
|
"kl": 0.0247802734375,
|
|
"learning_rate": 7.243820139034464e-07,
|
|
"loss": -1.5168,
|
|
"reward": 0.40000003576278687,
|
|
"reward_std": 0.2323790118098259,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 218
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 838.2083740234375,
|
|
"epoch": 0.12514285714285714,
|
|
"grad_norm": 1.1001908779144287,
|
|
"kl": 0.01165771484375,
|
|
"learning_rate": 7.214816693576234e-07,
|
|
"loss": 0.0019,
|
|
"reward": 0.9750000536441803,
|
|
"reward_std": 0.13869690895080566,
|
|
"rewards/accuracy_reward": 0.625,
|
|
"rewards/format_reward": 1.0,
|
|
"step": 219
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2691.5833740234375,
|
|
"epoch": 0.12571428571428572,
|
|
"grad_norm": 0.48478028178215027,
|
|
"kl": 0.017822265625,
|
|
"learning_rate": 7.185729670371604e-07,
|
|
"loss": -0.9277,
|
|
"reward": 0.5,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 220
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1311.3333740234375,
|
|
"epoch": 0.12628571428571428,
|
|
"grad_norm": 0.8523876667022705,
|
|
"kl": 0.015106201171875,
|
|
"learning_rate": 7.156560487081051e-07,
|
|
"loss": 0.0024,
|
|
"reward": 0.7500000298023224,
|
|
"reward_std": 0.16431677341461182,
|
|
"rewards/accuracy_reward": 0.5,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 221
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1624.291748046875,
|
|
"epoch": 0.12685714285714286,
|
|
"grad_norm": 0.6330533623695374,
|
|
"kl": 0.02435302734375,
|
|
"learning_rate": 7.127310565369415e-07,
|
|
"loss": -0.4816,
|
|
"reward": 0.675000011920929,
|
|
"reward_std": 0.3061862289905548,
|
|
"rewards/accuracy_reward": 0.3333333358168602,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 222
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2770.0,
|
|
"epoch": 0.12742857142857142,
|
|
"grad_norm": 0.8254171013832092,
|
|
"kl": 0.0244140625,
|
|
"learning_rate": 7.097981330836616e-07,
|
|
"loss": -1.1907,
|
|
"reward": 0.2500000111758709,
|
|
"reward_std": 0.24494898319244385,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.3333333544433117,
|
|
"step": 223
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2910.3751220703125,
|
|
"epoch": 0.128,
|
|
"grad_norm": 0.570878267288208,
|
|
"kl": 0.0177001953125,
|
|
"learning_rate": 7.068574212948169e-07,
|
|
"loss": -0.8357,
|
|
"reward": 0.4500000476837158,
|
|
"reward_std": 0.2323790118098259,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 224
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2329.0001220703125,
|
|
"epoch": 0.12857142857142856,
|
|
"grad_norm": 0.5944868326187134,
|
|
"kl": 0.0166015625,
|
|
"learning_rate": 7.039090644965509e-07,
|
|
"loss": -1.557,
|
|
"reward": 0.4500000402331352,
|
|
"reward_std": 0.19993416219949722,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.541666679084301,
|
|
"step": 225
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1627.3334350585938,
|
|
"epoch": 0.12914285714285714,
|
|
"grad_norm": 0.7897810935974121,
|
|
"kl": 0.019805908203125,
|
|
"learning_rate": 7.009532063876148e-07,
|
|
"loss": -0.7406,
|
|
"reward": 0.6000000238418579,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.2916666865348816,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 226
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1878.7083740234375,
|
|
"epoch": 0.12971428571428573,
|
|
"grad_norm": 0.5080549716949463,
|
|
"kl": 0.01611328125,
|
|
"learning_rate": 6.979899910323624e-07,
|
|
"loss": -0.7196,
|
|
"reward": 0.6500000357627869,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.375,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 227
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1201.9167175292969,
|
|
"epoch": 0.13028571428571428,
|
|
"grad_norm": 0.7728170156478882,
|
|
"kl": 0.0109405517578125,
|
|
"learning_rate": 6.950195628537299e-07,
|
|
"loss": -0.6648,
|
|
"reward": 0.7500000596046448,
|
|
"reward_std": 0.22085529565811157,
|
|
"rewards/accuracy_reward": 0.5000000298023224,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 228
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1906.3750610351562,
|
|
"epoch": 0.13085714285714287,
|
|
"grad_norm": 0.6207183003425598,
|
|
"kl": 0.015899658203125,
|
|
"learning_rate": 6.920420666261961e-07,
|
|
"loss": -1.5599,
|
|
"reward": 0.5250000357627869,
|
|
"reward_std": 0.20463287830352783,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 229
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2654.2084350585938,
|
|
"epoch": 0.13142857142857142,
|
|
"grad_norm": 0.7562258243560791,
|
|
"kl": 0.01849365234375,
|
|
"learning_rate": 6.890576474687263e-07,
|
|
"loss": -2.2566,
|
|
"reward": 0.5000000298023224,
|
|
"reward_std": 0.3759405389428139,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.6250000298023224,
|
|
"step": 230
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2595.666748046875,
|
|
"epoch": 0.132,
|
|
"grad_norm": 0.6036232709884644,
|
|
"kl": 0.024169921875,
|
|
"learning_rate": 6.860664508377001e-07,
|
|
"loss": -1.7953,
|
|
"reward": 0.3250000402331352,
|
|
"reward_std": 0.2370777204632759,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.416666679084301,
|
|
"step": 231
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2626.5001220703125,
|
|
"epoch": 0.13257142857142856,
|
|
"grad_norm": 1.021782398223877,
|
|
"kl": 0.035400390625,
|
|
"learning_rate": 6.83068622519821e-07,
|
|
"loss": -1.0878,
|
|
"reward": 0.27500003203749657,
|
|
"reward_std": 0.20463287830352783,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.3333333544433117,
|
|
"step": 232
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3502.3333740234375,
|
|
"epoch": 0.13314285714285715,
|
|
"grad_norm": 0.8032714128494263,
|
|
"kl": 0.02203369140625,
|
|
"learning_rate": 6.800643086250121e-07,
|
|
"loss": -1.0137,
|
|
"reward": 0.05000000447034836,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.0833333358168602,
|
|
"step": 233
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2883.375,
|
|
"epoch": 0.1337142857142857,
|
|
"grad_norm": 0.8170682787895203,
|
|
"kl": 0.02398681640625,
|
|
"learning_rate": 6.770536555792944e-07,
|
|
"loss": -1.9184,
|
|
"reward": 0.2500000149011612,
|
|
"reward_std": 0.22963719069957733,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.2916666716337204,
|
|
"step": 234
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1954.541748046875,
|
|
"epoch": 0.13428571428571429,
|
|
"grad_norm": 0.7082000970840454,
|
|
"kl": 0.01861572265625,
|
|
"learning_rate": 6.740368101176495e-07,
|
|
"loss": -1.5973,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.1741531491279602,
|
|
"rewards/accuracy_reward": 0.3333333544433117,
|
|
"rewards/format_reward": 0.583333358168602,
|
|
"step": 235
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1402.4166870117188,
|
|
"epoch": 0.13485714285714287,
|
|
"grad_norm": 0.6959501504898071,
|
|
"kl": 0.018798828125,
|
|
"learning_rate": 6.710139192768694e-07,
|
|
"loss": -0.742,
|
|
"reward": 0.7250000536441803,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.2500000111758709,
|
|
"rewards/format_reward": 0.9583333432674408,
|
|
"step": 236
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1551.0834350585938,
|
|
"epoch": 0.13542857142857143,
|
|
"grad_norm": 1.2837129831314087,
|
|
"kl": 0.018768310546875,
|
|
"learning_rate": 6.679851303883891e-07,
|
|
"loss": -1.248,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 237
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2601.291748046875,
|
|
"epoch": 0.136,
|
|
"grad_norm": 0.6121569275856018,
|
|
"kl": 0.02252197265625,
|
|
"learning_rate": 6.649505910711058e-07,
|
|
"loss": -0.9744,
|
|
"reward": 0.32500000298023224,
|
|
"reward_std": 0.22555401921272278,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 238
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2471.7500610351562,
|
|
"epoch": 0.13657142857142857,
|
|
"grad_norm": 0.7961202263832092,
|
|
"kl": 0.02349853515625,
|
|
"learning_rate": 6.619104492241847e-07,
|
|
"loss": -1.697,
|
|
"reward": 0.5,
|
|
"reward_std": 0.3340982347726822,
|
|
"rewards/accuracy_reward": 0.291666679084301,
|
|
"rewards/format_reward": 0.5416666716337204,
|
|
"step": 239
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2069.291748046875,
|
|
"epoch": 0.13714285714285715,
|
|
"grad_norm": 26.108394622802734,
|
|
"kl": 0.1033935546875,
|
|
"learning_rate": 6.588648530198504e-07,
|
|
"loss": -1.4725,
|
|
"reward": 0.40000002086162567,
|
|
"reward_std": 0.17232800275087357,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.6250000149011612,
|
|
"step": 240
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2281.0833435058594,
|
|
"epoch": 0.1377142857142857,
|
|
"grad_norm": 0.651018500328064,
|
|
"kl": 0.02239990234375,
|
|
"learning_rate": 6.558139508961654e-07,
|
|
"loss": -0.6258,
|
|
"reward": 0.6000000759959221,
|
|
"reward_std": 0.19993416219949722,
|
|
"rewards/accuracy_reward": 0.4583333544433117,
|
|
"rewards/format_reward": 0.5416666679084301,
|
|
"step": 241
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2965.625,
|
|
"epoch": 0.1382857142857143,
|
|
"grad_norm": 0.9381749033927917,
|
|
"kl": 0.02398681640625,
|
|
"learning_rate": 6.527578915497951e-07,
|
|
"loss": -1.8083,
|
|
"reward": 0.15000001341104507,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.2500000074505806,
|
|
"step": 242
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2690.166748046875,
|
|
"epoch": 0.13885714285714285,
|
|
"grad_norm": 0.735163152217865,
|
|
"kl": 0.02117919921875,
|
|
"learning_rate": 6.496968239287603e-07,
|
|
"loss": -1.3621,
|
|
"reward": 0.2250000163912773,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.3750000223517418,
|
|
"step": 243
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2612.5416870117188,
|
|
"epoch": 0.13942857142857143,
|
|
"grad_norm": 0.8661508560180664,
|
|
"kl": 0.0235595703125,
|
|
"learning_rate": 6.466308972251785e-07,
|
|
"loss": -1.5249,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.2323790192604065,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 244
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2336.8751220703125,
|
|
"epoch": 0.14,
|
|
"grad_norm": 0.9766954779624939,
|
|
"kl": 0.019775390625,
|
|
"learning_rate": 6.435602608679916e-07,
|
|
"loss": -1.3257,
|
|
"reward": 0.3500000238418579,
|
|
"reward_std": 0.2566385716199875,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 245
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2764.25,
|
|
"epoch": 0.14057142857142857,
|
|
"grad_norm": 0.5228642821311951,
|
|
"kl": 0.01849365234375,
|
|
"learning_rate": 6.404850645156841e-07,
|
|
"loss": -0.5755,
|
|
"reward": 0.2250000163912773,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 246
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3086.0001220703125,
|
|
"epoch": 0.14114285714285715,
|
|
"grad_norm": 0.7841210961341858,
|
|
"kl": 0.023162841796875,
|
|
"learning_rate": 6.374054580489873e-07,
|
|
"loss": -2.1551,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.39986832439899445,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.4166666716337204,
|
|
"step": 247
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1750.7083740234375,
|
|
"epoch": 0.1417142857142857,
|
|
"grad_norm": 0.7208341360092163,
|
|
"kl": 0.017120361328125,
|
|
"learning_rate": 6.343215915635761e-07,
|
|
"loss": -0.7416,
|
|
"reward": 0.7250000536441803,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.5,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 248
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2371.8751220703125,
|
|
"epoch": 0.1422857142857143,
|
|
"grad_norm": 0.6523035168647766,
|
|
"kl": 0.018218994140625,
|
|
"learning_rate": 6.31233615362752e-07,
|
|
"loss": -0.7417,
|
|
"reward": 0.2750000059604645,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 249
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2791.416748046875,
|
|
"epoch": 0.14285714285714285,
|
|
"grad_norm": 0.7076215147972107,
|
|
"kl": 0.01898193359375,
|
|
"learning_rate": 6.281416799501187e-07,
|
|
"loss": -0.4655,
|
|
"reward": 0.32500001415610313,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.2916666679084301,
|
|
"step": 250
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1733.0000610351562,
|
|
"epoch": 0.14342857142857143,
|
|
"grad_norm": 0.5396320819854736,
|
|
"kl": 0.0194091796875,
|
|
"learning_rate": 6.25045936022246e-07,
|
|
"loss": -0.7209,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.17232800275087357,
|
|
"rewards/accuracy_reward": 0.2083333395421505,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 251
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2229.7501220703125,
|
|
"epoch": 0.144,
|
|
"grad_norm": 1.1370456218719482,
|
|
"kl": 0.029296875,
|
|
"learning_rate": 6.219465344613258e-07,
|
|
"loss": -2.3579,
|
|
"reward": 0.30000002682209015,
|
|
"reward_std": 0.24978766590356827,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.458333358168602,
|
|
"step": 252
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2835.25,
|
|
"epoch": 0.14457142857142857,
|
|
"grad_norm": 0.5109323859214783,
|
|
"kl": 0.0166015625,
|
|
"learning_rate": 6.188436263278172e-07,
|
|
"loss": 0.0027,
|
|
"reward": 0.17500001192092896,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 253
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2688.6250610351562,
|
|
"epoch": 0.14514285714285713,
|
|
"grad_norm": 0.5289558172225952,
|
|
"kl": 0.01513671875,
|
|
"learning_rate": 6.157373628530852e-07,
|
|
"loss": -1.8877,
|
|
"reward": 0.40000002086162567,
|
|
"reward_std": 0.36425092816352844,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 254
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2283.25,
|
|
"epoch": 0.1457142857142857,
|
|
"grad_norm": 0.7051854729652405,
|
|
"kl": 0.01953125,
|
|
"learning_rate": 6.126278954320294e-07,
|
|
"loss": -1.7114,
|
|
"reward": 0.5,
|
|
"reward_std": 0.21162375062704086,
|
|
"rewards/accuracy_reward": 0.3333333358168602,
|
|
"rewards/format_reward": 0.5000000149011612,
|
|
"step": 255
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3230.7501220703125,
|
|
"epoch": 0.1462857142857143,
|
|
"grad_norm": 0.6201116442680359,
|
|
"kl": 0.0255126953125,
|
|
"learning_rate": 6.095153756157051e-07,
|
|
"loss": -1.4431,
|
|
"reward": 0.10000000894069672,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.1666666716337204,
|
|
"step": 256
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2076.9166870117188,
|
|
"epoch": 0.14685714285714285,
|
|
"grad_norm": 0.9742373824119568,
|
|
"kl": 0.02349853515625,
|
|
"learning_rate": 6.06399955103937e-07,
|
|
"loss": -1.6298,
|
|
"reward": 0.5000000298023224,
|
|
"reward_std": 0.27253394573926926,
|
|
"rewards/accuracy_reward": 0.2500000074505806,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 257
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1520.791748046875,
|
|
"epoch": 0.14742857142857144,
|
|
"grad_norm": 0.6934832334518433,
|
|
"kl": 0.011260986328125,
|
|
"learning_rate": 6.032817857379256e-07,
|
|
"loss": -0.7432,
|
|
"reward": 0.5250000357627869,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 258
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2571.0,
|
|
"epoch": 0.148,
|
|
"grad_norm": 0.5018502473831177,
|
|
"kl": 0.01641845703125,
|
|
"learning_rate": 6.001610194928464e-07,
|
|
"loss": -0.8253,
|
|
"reward": 0.27500003576278687,
|
|
"reward_std": 0.14747881889343262,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 259
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3117.416748046875,
|
|
"epoch": 0.14857142857142858,
|
|
"grad_norm": 134.84226989746094,
|
|
"kl": 1.43408203125,
|
|
"learning_rate": 5.97037808470444e-07,
|
|
"loss": -1.1063,
|
|
"reward": 0.22500000894069672,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 260
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3584.0,
|
|
"epoch": 0.14914285714285713,
|
|
"grad_norm": 0.3973945677280426,
|
|
"kl": 0.0257568359375,
|
|
"learning_rate": 5.939123048916173e-07,
|
|
"loss": -0.5713,
|
|
"reward": 0.02500000223517418,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.0416666679084301,
|
|
"step": 261
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2326.916748046875,
|
|
"epoch": 0.14971428571428572,
|
|
"grad_norm": 0.5301626324653625,
|
|
"kl": 0.019775390625,
|
|
"learning_rate": 5.907846610890011e-07,
|
|
"loss": -0.7393,
|
|
"reward": 0.4500000476837158,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.2083333395421505,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 262
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2577.666748046875,
|
|
"epoch": 0.15028571428571427,
|
|
"grad_norm": 0.6672086715698242,
|
|
"kl": 0.02447509765625,
|
|
"learning_rate": 5.87655029499542e-07,
|
|
"loss": -0.7189,
|
|
"reward": 0.32500000298023224,
|
|
"reward_std": 0.1596180573105812,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 263
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3584.0,
|
|
"epoch": 0.15085714285714286,
|
|
"grad_norm": 0.40429234504699707,
|
|
"kl": 0.02337646484375,
|
|
"learning_rate": 5.845235626570683e-07,
|
|
"loss": 0.0037,
|
|
"reward": 0.0,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.0,
|
|
"step": 264
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3360.25,
|
|
"epoch": 0.15142857142857144,
|
|
"grad_norm": 0.6446682214736938,
|
|
"kl": 0.025390625,
|
|
"learning_rate": 5.813904131848564e-07,
|
|
"loss": -0.616,
|
|
"reward": 0.05000000447034836,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.0833333358168602,
|
|
"step": 265
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2176.416748046875,
|
|
"epoch": 0.152,
|
|
"grad_norm": 0.7718228101730347,
|
|
"kl": 0.031494140625,
|
|
"learning_rate": 5.78255733788191e-07,
|
|
"loss": -0.4889,
|
|
"reward": 0.3750000149011612,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 266
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1419.5416870117188,
|
|
"epoch": 0.15257142857142858,
|
|
"grad_norm": 0.5885297656059265,
|
|
"kl": 0.018798828125,
|
|
"learning_rate": 5.751196772469237e-07,
|
|
"loss": 0.003,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 267
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2091.0000610351562,
|
|
"epoch": 0.15314285714285714,
|
|
"grad_norm": 0.7558290958404541,
|
|
"kl": 0.01812744140625,
|
|
"learning_rate": 5.71982396408026e-07,
|
|
"loss": -0.5488,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 268
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 3082.3333740234375,
|
|
"epoch": 0.15371428571428572,
|
|
"grad_norm": 0.9141106009483337,
|
|
"kl": 0.021484375,
|
|
"learning_rate": 5.688440441781398e-07,
|
|
"loss": -2.2151,
|
|
"reward": 0.15000001341104507,
|
|
"reward_std": 0.2323790118098259,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.2500000074505806,
|
|
"step": 269
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1514.1666870117188,
|
|
"epoch": 0.15428571428571428,
|
|
"grad_norm": 0.8448731899261475,
|
|
"kl": 0.01568603515625,
|
|
"learning_rate": 5.657047735161255e-07,
|
|
"loss": -0.7411,
|
|
"reward": 0.675000011920929,
|
|
"reward_std": 0.13869690895080566,
|
|
"rewards/accuracy_reward": 0.4166666716337204,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 270
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1341.916748046875,
|
|
"epoch": 0.15485714285714286,
|
|
"grad_norm": 1.3789052963256836,
|
|
"kl": 0.020538330078125,
|
|
"learning_rate": 5.625647374256061e-07,
|
|
"loss": -0.7409,
|
|
"reward": 0.6500000357627869,
|
|
"reward_std": 0.19993415474891663,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.9583333432674408,
|
|
"step": 271
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2583.7083740234375,
|
|
"epoch": 0.15542857142857142,
|
|
"grad_norm": 1.2276079654693604,
|
|
"kl": 0.0244140625,
|
|
"learning_rate": 5.594240889475106e-07,
|
|
"loss": -2.3455,
|
|
"reward": 0.40000002086162567,
|
|
"reward_std": 0.36425092816352844,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 272
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2224.0,
|
|
"epoch": 0.156,
|
|
"grad_norm": 0.45050886273384094,
|
|
"kl": 0.022003173828125,
|
|
"learning_rate": 5.562829811526154e-07,
|
|
"loss": 0.0035,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.2500000074505806,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 273
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2945.0833740234375,
|
|
"epoch": 0.15657142857142858,
|
|
"grad_norm": 0.7557133436203003,
|
|
"kl": 0.01849365234375,
|
|
"learning_rate": 5.531415671340826e-07,
|
|
"loss": 0.003,
|
|
"reward": 0.2250000238418579,
|
|
"reward_std": 0.08215838670730591,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 274
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1650.8333740234375,
|
|
"epoch": 0.15714285714285714,
|
|
"grad_norm": 0.675412654876709,
|
|
"kl": 0.014434814453125,
|
|
"learning_rate": 5.5e-07,
|
|
"loss": -0.7416,
|
|
"reward": 0.5000000149011612,
|
|
"reward_std": 0.19993415474891663,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 275
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1545.3334350585938,
|
|
"epoch": 0.15771428571428572,
|
|
"grad_norm": 0.7588775157928467,
|
|
"kl": 0.018646240234375,
|
|
"learning_rate": 5.468584328659172e-07,
|
|
"loss": -1.4864,
|
|
"reward": 0.9750000238418579,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.7083333432674408,
|
|
"rewards/format_reward": 0.9166666865348816,
|
|
"step": 276
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2915.791748046875,
|
|
"epoch": 0.15828571428571428,
|
|
"grad_norm": 1.0028012990951538,
|
|
"kl": 0.0211181640625,
|
|
"learning_rate": 5.437170188473847e-07,
|
|
"loss": -1.5921,
|
|
"reward": 0.3499999940395355,
|
|
"reward_std": 0.26039472222328186,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 277
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2256.25,
|
|
"epoch": 0.15885714285714286,
|
|
"grad_norm": 0.644159734249115,
|
|
"kl": 0.015411376953125,
|
|
"learning_rate": 5.405759110524894e-07,
|
|
"loss": 0.0025,
|
|
"reward": 0.40000003576278687,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 278
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2127.9584350585938,
|
|
"epoch": 0.15942857142857142,
|
|
"grad_norm": 1.0713709592819214,
|
|
"kl": 0.0250244140625,
|
|
"learning_rate": 5.37435262574394e-07,
|
|
"loss": -2.5405,
|
|
"reward": 0.5500000268220901,
|
|
"reward_std": 0.2861757129430771,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 279
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2087.2083740234375,
|
|
"epoch": 0.16,
|
|
"grad_norm": 0.44367942214012146,
|
|
"kl": 0.014678955078125,
|
|
"learning_rate": 5.342952264838747e-07,
|
|
"loss": -0.7211,
|
|
"reward": 0.5000000298023224,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 280
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1533.0417175292969,
|
|
"epoch": 0.16057142857142856,
|
|
"grad_norm": 1.1346826553344727,
|
|
"kl": 0.015472412109375,
|
|
"learning_rate": 5.311559558218603e-07,
|
|
"loss": -0.7422,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.6666666865348816,
|
|
"step": 281
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2707.9166870117188,
|
|
"epoch": 0.16114285714285714,
|
|
"grad_norm": 0.7678596377372742,
|
|
"kl": 0.0263671875,
|
|
"learning_rate": 5.28017603591974e-07,
|
|
"loss": -0.7124,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.20871606469154358,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.375,
|
|
"step": 282
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2104.8333740234375,
|
|
"epoch": 0.16171428571428573,
|
|
"grad_norm": 0.3849073052406311,
|
|
"kl": 0.01458740234375,
|
|
"learning_rate": 5.248803227530763e-07,
|
|
"loss": 0.0023,
|
|
"reward": 0.32500000298023224,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.5,
|
|
"step": 283
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2547.9166870117188,
|
|
"epoch": 0.16228571428571428,
|
|
"grad_norm": 1.1764817237854004,
|
|
"kl": 0.01708984375,
|
|
"learning_rate": 5.21744266211809e-07,
|
|
"loss": -2.3522,
|
|
"reward": 0.5750000178813934,
|
|
"reward_std": 0.3804733455181122,
|
|
"rewards/accuracy_reward": 0.4583333432674408,
|
|
"rewards/format_reward": 0.5000000298023224,
|
|
"step": 284
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2855.8333740234375,
|
|
"epoch": 0.16285714285714287,
|
|
"grad_norm": 0.8596624135971069,
|
|
"kl": 0.0228271484375,
|
|
"learning_rate": 5.186095868151436e-07,
|
|
"loss": -1.918,
|
|
"reward": 0.2500000111758709,
|
|
"reward_std": 0.2773938328027725,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.3333333544433117,
|
|
"step": 285
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2617.541748046875,
|
|
"epoch": 0.16342857142857142,
|
|
"grad_norm": 1.1564656496047974,
|
|
"kl": 0.0211181640625,
|
|
"learning_rate": 5.154764373429315e-07,
|
|
"loss": -2.6721,
|
|
"reward": 0.6750000417232513,
|
|
"reward_std": 0.37711599469184875,
|
|
"rewards/accuracy_reward": 0.5000000149011612,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 286
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1809.9584350585938,
|
|
"epoch": 0.164,
|
|
"grad_norm": 0.8912317156791687,
|
|
"kl": 0.017974853515625,
|
|
"learning_rate": 5.123449705004581e-07,
|
|
"loss": -1.9258,
|
|
"reward": 0.5750000476837158,
|
|
"reward_std": 0.3480285108089447,
|
|
"rewards/accuracy_reward": 0.2500000111758709,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 287
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2473.6666870117188,
|
|
"epoch": 0.16457142857142856,
|
|
"grad_norm": 0.6116588115692139,
|
|
"kl": 0.01898193359375,
|
|
"learning_rate": 5.09215338910999e-07,
|
|
"loss": -0.8855,
|
|
"reward": 0.375,
|
|
"reward_std": 0.13869690895080566,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.4166666865348816,
|
|
"step": 288
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1470.0416870117188,
|
|
"epoch": 0.16514285714285715,
|
|
"grad_norm": 0.5574433207511902,
|
|
"kl": 0.017059326171875,
|
|
"learning_rate": 5.060876951083828e-07,
|
|
"loss": -0.7664,
|
|
"reward": 0.6250000298023224,
|
|
"reward_std": 0.21615658700466156,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.9166666865348816,
|
|
"step": 289
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1707.041748046875,
|
|
"epoch": 0.1657142857142857,
|
|
"grad_norm": 1.1696451902389526,
|
|
"kl": 0.0301513671875,
|
|
"learning_rate": 5.02962191529556e-07,
|
|
"loss": -1.657,
|
|
"reward": 0.5250000208616257,
|
|
"reward_std": 0.23356524109840393,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 290
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2784.0001220703125,
|
|
"epoch": 0.1662857142857143,
|
|
"grad_norm": 1.8924946784973145,
|
|
"kl": 0.0260009765625,
|
|
"learning_rate": 4.998389805071536e-07,
|
|
"loss": -2.9789,
|
|
"reward": 0.32500001788139343,
|
|
"reward_std": 0.33025872707366943,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.4583333432674408,
|
|
"step": 291
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1551.4166870117188,
|
|
"epoch": 0.16685714285714287,
|
|
"grad_norm": 1.2713048458099365,
|
|
"kl": 0.01983642578125,
|
|
"learning_rate": 4.967182142620745e-07,
|
|
"loss": -2.4029,
|
|
"reward": 0.5250000208616257,
|
|
"reward_std": 0.27286098897457123,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 292
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1370.5417175292969,
|
|
"epoch": 0.16742857142857143,
|
|
"grad_norm": 0.436497300863266,
|
|
"kl": 0.013275146484375,
|
|
"learning_rate": 4.93600044896063e-07,
|
|
"loss": 0.0021,
|
|
"reward": 0.6750000417232513,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.3750000223517418,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 293
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2411.7500610351562,
|
|
"epoch": 0.168,
|
|
"grad_norm": 0.9047071933746338,
|
|
"kl": 0.021728515625,
|
|
"learning_rate": 4.904846243842949e-07,
|
|
"loss": -1.6188,
|
|
"reward": 0.30000003427267075,
|
|
"reward_std": 0.1549193412065506,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.5000000223517418,
|
|
"step": 294
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2314.9583740234375,
|
|
"epoch": 0.16857142857142857,
|
|
"grad_norm": 0.8080445528030396,
|
|
"kl": 0.02862548828125,
|
|
"learning_rate": 4.873721045679706e-07,
|
|
"loss": -2.1152,
|
|
"reward": 0.30000003427267075,
|
|
"reward_std": 0.19993416219949722,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.5000000223517418,
|
|
"step": 295
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2848.9583740234375,
|
|
"epoch": 0.16914285714285715,
|
|
"grad_norm": 0.5764908790588379,
|
|
"kl": 0.02508544921875,
|
|
"learning_rate": 4.842626371469149e-07,
|
|
"loss": -0.6015,
|
|
"reward": 0.17500000819563866,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.25,
|
|
"step": 296
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2570.166748046875,
|
|
"epoch": 0.1697142857142857,
|
|
"grad_norm": 1.0034595727920532,
|
|
"kl": 0.02734375,
|
|
"learning_rate": 4.811563736721829e-07,
|
|
"loss": -2.3512,
|
|
"reward": 0.45000000298023224,
|
|
"reward_std": 0.2658701241016388,
|
|
"rewards/accuracy_reward": 0.3333333544433117,
|
|
"rewards/format_reward": 0.4166666865348816,
|
|
"step": 297
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1806.4584350585938,
|
|
"epoch": 0.1702857142857143,
|
|
"grad_norm": 0.8656820058822632,
|
|
"kl": 0.0198974609375,
|
|
"learning_rate": 4.780534655386743e-07,
|
|
"loss": -1.6228,
|
|
"reward": 0.5250000506639481,
|
|
"reward_std": 0.29361625015735626,
|
|
"rewards/accuracy_reward": 0.2083333432674408,
|
|
"rewards/format_reward": 0.6666666716337204,
|
|
"step": 298
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1468.416748046875,
|
|
"epoch": 0.17085714285714285,
|
|
"grad_norm": 1.4623134136199951,
|
|
"kl": 0.025634765625,
|
|
"learning_rate": 4.749540639777539e-07,
|
|
"loss": -2.5606,
|
|
"reward": 0.4750000089406967,
|
|
"reward_std": 0.31102490425109863,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 299
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2191.3333740234375,
|
|
"epoch": 0.17142857142857143,
|
|
"grad_norm": 0.7832638621330261,
|
|
"kl": 0.02301025390625,
|
|
"learning_rate": 4.7185832004988133e-07,
|
|
"loss": -1.4816,
|
|
"reward": 0.32500001788139343,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 300
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2002.916748046875,
|
|
"epoch": 0.172,
|
|
"grad_norm": 0.4890463948249817,
|
|
"kl": 0.013916015625,
|
|
"learning_rate": 1.5267358321348285e-07,
|
|
"loss": -0.6248,
|
|
"reward": 0.3500000238418579,
|
|
"reward_std": 0.0774596706032753,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 301
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1066.3333435058594,
|
|
"epoch": 0.17257142857142857,
|
|
"grad_norm": 1.2706708908081055,
|
|
"kl": 0.01739501953125,
|
|
"learning_rate": 1.5058639494795067e-07,
|
|
"loss": -1.6018,
|
|
"reward": 0.7750000357627869,
|
|
"reward_std": 0.20295868068933487,
|
|
"rewards/accuracy_reward": 0.4166666716337204,
|
|
"rewards/format_reward": 0.8750000298023224,
|
|
"step": 302
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1463.7083435058594,
|
|
"epoch": 0.17314285714285715,
|
|
"grad_norm": 0.5108232498168945,
|
|
"kl": 0.01483154296875,
|
|
"learning_rate": 1.485389347912525e-07,
|
|
"loss": -0.8718,
|
|
"reward": 0.6500000357627869,
|
|
"reward_std": 0.22085529565811157,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.9166666865348816,
|
|
"step": 303
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2589.916748046875,
|
|
"epoch": 0.1737142857142857,
|
|
"grad_norm": 1.3276212215423584,
|
|
"kl": 0.013671875,
|
|
"learning_rate": 1.4653140639624066e-07,
|
|
"loss": -3.066,
|
|
"reward": 0.45000001788139343,
|
|
"reward_std": 0.43685072660446167,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 304
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2135.0,
|
|
"epoch": 0.1742857142857143,
|
|
"grad_norm": 1.30924654006958,
|
|
"kl": 0.012664794921875,
|
|
"learning_rate": 1.4456400944391144e-07,
|
|
"loss": -0.9444,
|
|
"reward": 0.3750000223517418,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.5833333358168602,
|
|
"step": 305
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1213.0833740234375,
|
|
"epoch": 0.17485714285714285,
|
|
"grad_norm": 1.3626219034194946,
|
|
"kl": 0.02374267578125,
|
|
"learning_rate": 1.4263693962354336e-07,
|
|
"loss": -1.6655,
|
|
"reward": 0.7500000596046448,
|
|
"reward_std": 0.3175487816333771,
|
|
"rewards/accuracy_reward": 0.4166666865348816,
|
|
"rewards/format_reward": 0.8333333432674408,
|
|
"step": 306
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2899.25,
|
|
"epoch": 0.17542857142857143,
|
|
"grad_norm": 0.6594395041465759,
|
|
"kl": 0.01434326171875,
|
|
"learning_rate": 1.4075038861323302e-07,
|
|
"loss": -0.8706,
|
|
"reward": 0.27500003576278687,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.125,
|
|
"rewards/format_reward": 0.3333333432674408,
|
|
"step": 307
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1401.2500915527344,
|
|
"epoch": 0.176,
|
|
"grad_norm": 0.5402439832687378,
|
|
"kl": 0.016876220703125,
|
|
"learning_rate": 1.3890454406082956e-07,
|
|
"loss": 0.0028,
|
|
"reward": 0.9000000357627869,
|
|
"reward_std": 0.0,
|
|
"rewards/accuracy_reward": 0.75,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 308
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2200.416748046875,
|
|
"epoch": 0.17657142857142857,
|
|
"grad_norm": 0.667547881603241,
|
|
"kl": 0.024871826171875,
|
|
"learning_rate": 1.3709958956526974e-07,
|
|
"loss": -1.5809,
|
|
"reward": 0.32500000298023224,
|
|
"reward_std": 0.1596180498600006,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 309
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2872.5833740234375,
|
|
"epoch": 0.17714285714285713,
|
|
"grad_norm": 0.6462530493736267,
|
|
"kl": 0.014068603515625,
|
|
"learning_rate": 1.353357046583165e-07,
|
|
"loss": -1.4915,
|
|
"reward": 0.30000001192092896,
|
|
"reward_std": 0.2258318066596985,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.3750000149011612,
|
|
"step": 310
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2144.3333740234375,
|
|
"epoch": 0.1777142857142857,
|
|
"grad_norm": 0.760235607624054,
|
|
"kl": 0.012969970703125,
|
|
"learning_rate": 1.3361306478670148e-07,
|
|
"loss": -2.1933,
|
|
"reward": 0.3500000089406967,
|
|
"reward_std": 0.23826396465301514,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 311
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1499.1250610351562,
|
|
"epoch": 0.1782857142857143,
|
|
"grad_norm": 1.0971490144729614,
|
|
"kl": 0.02130126953125,
|
|
"learning_rate": 1.3193184129467384e-07,
|
|
"loss": -0.6513,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.06123724579811096,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 312
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1944.8751220703125,
|
|
"epoch": 0.17885714285714285,
|
|
"grad_norm": 0.6792640089988708,
|
|
"kl": 0.01654052734375,
|
|
"learning_rate": 1.3029220140695756e-07,
|
|
"loss": -0.4889,
|
|
"reward": 0.3750000298023224,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.5416666865348816,
|
|
"step": 313
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1524.5000610351562,
|
|
"epoch": 0.17942857142857144,
|
|
"grad_norm": 1.003913402557373,
|
|
"kl": 0.015625,
|
|
"learning_rate": 1.2869430821211826e-07,
|
|
"loss": -1.2384,
|
|
"reward": 0.7500000298023224,
|
|
"reward_std": 0.2173428237438202,
|
|
"rewards/accuracy_reward": 0.5416666865348816,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 314
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2062.4166870117188,
|
|
"epoch": 0.18,
|
|
"grad_norm": 0.8467549085617065,
|
|
"kl": 0.01544189453125,
|
|
"learning_rate": 1.2713832064634125e-07,
|
|
"loss": -1.1761,
|
|
"reward": 0.40000002086162567,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.6666666716337204,
|
|
"step": 315
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1394.25,
|
|
"epoch": 0.18057142857142858,
|
|
"grad_norm": 1.1105024814605713,
|
|
"kl": 0.021453857421875,
|
|
"learning_rate": 1.2562439347762275e-07,
|
|
"loss": -1.5211,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.19993415474891663,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.875,
|
|
"step": 316
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1474.3333740234375,
|
|
"epoch": 0.18114285714285713,
|
|
"grad_norm": 0.979469358921051,
|
|
"kl": 0.011138916015625,
|
|
"learning_rate": 1.2415267729037608e-07,
|
|
"loss": -2.2617,
|
|
"reward": 0.6500000059604645,
|
|
"reward_std": 0.3548534959554672,
|
|
"rewards/accuracy_reward": 0.2916666716337204,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 317
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1628.416748046875,
|
|
"epoch": 0.18171428571428572,
|
|
"grad_norm": 0.563599705696106,
|
|
"kl": 0.01849365234375,
|
|
"learning_rate": 1.2272331847045313e-07,
|
|
"loss": 0.003,
|
|
"reward": 0.7500000894069672,
|
|
"reward_std": 0.19993416219949722,
|
|
"rewards/accuracy_reward": 0.5000000223517418,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 318
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1559.5833435058594,
|
|
"epoch": 0.18228571428571427,
|
|
"grad_norm": 0.9234899282455444,
|
|
"kl": 0.01715087890625,
|
|
"learning_rate": 1.2133645919058418e-07,
|
|
"loss": -1.6808,
|
|
"reward": 0.6250000149011612,
|
|
"reward_std": 0.19037556648254395,
|
|
"rewards/accuracy_reward": 0.2916666679084301,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 319
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1388.8333740234375,
|
|
"epoch": 0.18285714285714286,
|
|
"grad_norm": 1.0862469673156738,
|
|
"kl": 0.020263671875,
|
|
"learning_rate": 1.1999223739623666e-07,
|
|
"loss": -1.6945,
|
|
"reward": 0.6000000089406967,
|
|
"reward_std": 0.2658701241016388,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.8333333432674408,
|
|
"step": 320
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2178.0001220703125,
|
|
"epoch": 0.18342857142857144,
|
|
"grad_norm": 1.5233582258224487,
|
|
"kl": 0.022247314453125,
|
|
"learning_rate": 1.1869078679189393e-07,
|
|
"loss": -1.4616,
|
|
"reward": 0.32500001788139343,
|
|
"reward_std": 0.21615657955408096,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.458333358168602,
|
|
"step": 321
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 2045.166748046875,
|
|
"epoch": 0.184,
|
|
"grad_norm": 1.028342843055725,
|
|
"kl": 0.01947021484375,
|
|
"learning_rate": 1.1743223682775649e-07,
|
|
"loss": -1.4788,
|
|
"reward": 0.40000002086162567,
|
|
"reward_std": 0.23826396465301514,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.5833333432674408,
|
|
"step": 322
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1766.9583740234375,
|
|
"epoch": 0.18457142857142858,
|
|
"grad_norm": 1.2755353450775146,
|
|
"kl": 0.0216064453125,
|
|
"learning_rate": 1.1621671268686605e-07,
|
|
"loss": -2.4408,
|
|
"reward": 0.5000000298023224,
|
|
"reward_std": 0.3290724903345108,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 323
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1399.8750610351562,
|
|
"epoch": 0.18514285714285714,
|
|
"grad_norm": 0.7948786020278931,
|
|
"kl": 0.01861572265625,
|
|
"learning_rate": 1.1504433527265378e-07,
|
|
"loss": -1.615,
|
|
"reward": 0.6000000089406967,
|
|
"reward_std": 0.31572362780570984,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.8333333432674408,
|
|
"step": 324
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1793.8334350585938,
|
|
"epoch": 0.18571428571428572,
|
|
"grad_norm": 1.1149201393127441,
|
|
"kl": 0.02008056640625,
|
|
"learning_rate": 1.1391522119691496e-07,
|
|
"loss": -2.113,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.3386310636997223,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.6666666865348816,
|
|
"step": 325
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1259.4583740234375,
|
|
"epoch": 0.18628571428571428,
|
|
"grad_norm": 0.9218916296958923,
|
|
"kl": 0.024658203125,
|
|
"learning_rate": 1.1282948276820962e-07,
|
|
"loss": -0.7426,
|
|
"reward": 0.6000000536441803,
|
|
"reward_std": 0.22085530310869217,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.8333333432674408,
|
|
"step": 326
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1897.2083740234375,
|
|
"epoch": 0.18685714285714286,
|
|
"grad_norm": 1.3489266633987427,
|
|
"kl": 0.02215576171875,
|
|
"learning_rate": 1.1178722798069215e-07,
|
|
"loss": -2.2226,
|
|
"reward": 0.5000000149011612,
|
|
"reward_std": 0.2773938328027725,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.6666666865348816,
|
|
"step": 327
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1416.5833435058594,
|
|
"epoch": 0.18742857142857142,
|
|
"grad_norm": 0.8123278617858887,
|
|
"kl": 0.02020263671875,
|
|
"learning_rate": 1.10788560503369e-07,
|
|
"loss": 0.0032,
|
|
"reward": 0.5500000268220901,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 328
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1962.4583740234375,
|
|
"epoch": 0.188,
|
|
"grad_norm": 0.5757811665534973,
|
|
"kl": 0.019775390625,
|
|
"learning_rate": 1.0983357966978745e-07,
|
|
"loss": -0.9902,
|
|
"reward": 0.42499999701976776,
|
|
"reward_std": 0.20463287830352783,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.625,
|
|
"step": 329
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1766.2500610351562,
|
|
"epoch": 0.18857142857142858,
|
|
"grad_norm": 0.9221534132957458,
|
|
"kl": 0.01800537109375,
|
|
"learning_rate": 1.0892238046815527e-07,
|
|
"loss": -1.6717,
|
|
"reward": 0.5500000268220901,
|
|
"reward_std": 0.2983149588108063,
|
|
"rewards/accuracy_reward": 0.2083333395421505,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 330
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1579.5833435058594,
|
|
"epoch": 0.18914285714285714,
|
|
"grad_norm": 0.9036383628845215,
|
|
"kl": 0.01678466796875,
|
|
"learning_rate": 1.0805505353189254e-07,
|
|
"loss": -2.0184,
|
|
"reward": 0.5750000476837158,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.7083333432674408,
|
|
"step": 331
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1057.0,
|
|
"epoch": 0.18971428571428572,
|
|
"grad_norm": 0.5648021697998047,
|
|
"kl": 0.010589599609375,
|
|
"learning_rate": 1.0723168513061665e-07,
|
|
"loss": -0.7254,
|
|
"reward": 0.7000000476837158,
|
|
"reward_std": 0.17232800275087357,
|
|
"rewards/accuracy_reward": 0.2083333395421505,
|
|
"rewards/format_reward": 0.9583333432674408,
|
|
"step": 332
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1274.291748046875,
|
|
"epoch": 0.19028571428571428,
|
|
"grad_norm": 0.8038429617881775,
|
|
"kl": 0.01422119140625,
|
|
"learning_rate": 1.0645235716156168e-07,
|
|
"loss": -1.4998,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.17232800275087357,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.8750000298023224,
|
|
"step": 333
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1840.2084350585938,
|
|
"epoch": 0.19085714285714286,
|
|
"grad_norm": 0.7369369864463806,
|
|
"kl": 0.01922607421875,
|
|
"learning_rate": 1.0571714714143197e-07,
|
|
"loss": -0.7416,
|
|
"reward": 0.5750000476837158,
|
|
"reward_std": 0.18371173739433289,
|
|
"rewards/accuracy_reward": 0.2916666865348816,
|
|
"rewards/format_reward": 0.6666666865348816,
|
|
"step": 334
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1033.6666870117188,
|
|
"epoch": 0.19142857142857142,
|
|
"grad_norm": 0.9152414202690125,
|
|
"kl": 0.012664794921875,
|
|
"learning_rate": 1.0502612819869216e-07,
|
|
"loss": -0.759,
|
|
"reward": 0.5750000476837158,
|
|
"reward_std": 0.11291590332984924,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.9166666865348816,
|
|
"step": 335
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 929.3333435058594,
|
|
"epoch": 0.192,
|
|
"grad_norm": 0.354445219039917,
|
|
"kl": 0.016204833984375,
|
|
"learning_rate": 1.0437936906629334e-07,
|
|
"loss": 0.0026,
|
|
"reward": 0.7750000357627869,
|
|
"reward_std": 0.1596180573105812,
|
|
"rewards/accuracy_reward": 0.2916666716337204,
|
|
"rewards/format_reward": 1.0,
|
|
"step": 336
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1763.0000915527344,
|
|
"epoch": 0.19257142857142856,
|
|
"grad_norm": 1.0156564712524414,
|
|
"kl": 0.015411376953125,
|
|
"learning_rate": 1.0377693407483638e-07,
|
|
"loss": -1.6813,
|
|
"reward": 0.6500000357627869,
|
|
"reward_std": 0.2658701241016388,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.8333333432674408,
|
|
"step": 337
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1598.7083435058594,
|
|
"epoch": 0.19314285714285714,
|
|
"grad_norm": 0.8538982272148132,
|
|
"kl": 0.021728515625,
|
|
"learning_rate": 1.032188831461732e-07,
|
|
"loss": -1.2843,
|
|
"reward": 0.7000000178813934,
|
|
"reward_std": 0.2658701241016388,
|
|
"rewards/accuracy_reward": 0.4166666865348816,
|
|
"rewards/format_reward": 0.7500000298023224,
|
|
"step": 338
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1587.2083740234375,
|
|
"epoch": 0.19371428571428573,
|
|
"grad_norm": 1.12893545627594,
|
|
"kl": 0.01947021484375,
|
|
"learning_rate": 1.0270527178744664e-07,
|
|
"loss": -1.5945,
|
|
"reward": 0.5000000447034836,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.8333333432674408,
|
|
"step": 339
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1353.5833740234375,
|
|
"epoch": 0.19428571428571428,
|
|
"grad_norm": 0.6671582460403442,
|
|
"kl": 0.01910400390625,
|
|
"learning_rate": 1.0223615108556937e-07,
|
|
"loss": -0.7477,
|
|
"reward": 0.6000000536441803,
|
|
"reward_std": 0.20871604979038239,
|
|
"rewards/accuracy_reward": 0.1250000037252903,
|
|
"rewards/format_reward": 0.875,
|
|
"step": 340
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1181.7500610351562,
|
|
"epoch": 0.19485714285714287,
|
|
"grad_norm": 0.725435733795166,
|
|
"kl": 0.01190185546875,
|
|
"learning_rate": 1.0181156770214242e-07,
|
|
"loss": -1.4692,
|
|
"reward": 0.5750000476837158,
|
|
"reward_std": 0.15610557794570923,
|
|
"rewards/accuracy_reward": 0.0416666679084301,
|
|
"rewards/format_reward": 0.9166666865348816,
|
|
"step": 341
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 870.9166870117188,
|
|
"epoch": 0.19542857142857142,
|
|
"grad_norm": 1.4129509925842285,
|
|
"kl": 0.017486572265625,
|
|
"learning_rate": 1.0143156386881408e-07,
|
|
"loss": -0.742,
|
|
"reward": 0.7250000834465027,
|
|
"reward_std": 0.21615658700466156,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.9583333432674408,
|
|
"step": 342
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1194.5000610351562,
|
|
"epoch": 0.196,
|
|
"grad_norm": 1.0423340797424316,
|
|
"kl": 0.0213623046875,
|
|
"learning_rate": 1.0109617738307911e-07,
|
|
"loss": -1.4624,
|
|
"reward": 0.6500000357627869,
|
|
"reward_std": 0.24494898319244385,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.9166666865348816,
|
|
"step": 343
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1545.8750610351562,
|
|
"epoch": 0.19657142857142856,
|
|
"grad_norm": 0.3742052912712097,
|
|
"kl": 0.0191650390625,
|
|
"learning_rate": 1.0080544160451918e-07,
|
|
"loss": 0.0031,
|
|
"reward": 0.5500000417232513,
|
|
"reward_std": 0.14339563250541687,
|
|
"rewards/accuracy_reward": 0.1666666716337204,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 344
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1445.9583435058594,
|
|
"epoch": 0.19714285714285715,
|
|
"grad_norm": 1.0773917436599731,
|
|
"kl": 0.019287109375,
|
|
"learning_rate": 1.0055938545148495e-07,
|
|
"loss": -0.5323,
|
|
"reward": 0.7500000596046448,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.5000000298023224,
|
|
"rewards/format_reward": 0.75,
|
|
"step": 345
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1790.5000610351562,
|
|
"epoch": 0.1977142857142857,
|
|
"grad_norm": 0.7913344502449036,
|
|
"kl": 0.0220947265625,
|
|
"learning_rate": 1.0035803339821934e-07,
|
|
"loss": -1.419,
|
|
"reward": 0.550000011920929,
|
|
"reward_std": 0.3209003359079361,
|
|
"rewards/accuracy_reward": 0.1666666679084301,
|
|
"rewards/format_reward": 0.7500000298023224,
|
|
"step": 346
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1382.2916870117188,
|
|
"epoch": 0.1982857142857143,
|
|
"grad_norm": 1.1861852407455444,
|
|
"kl": 0.015045166015625,
|
|
"learning_rate": 1.002014054724235e-07,
|
|
"loss": -1.5114,
|
|
"reward": 0.5750000178813934,
|
|
"reward_std": 0.21615657955408096,
|
|
"rewards/accuracy_reward": 0.0833333358168602,
|
|
"rewards/format_reward": 0.8750000298023224,
|
|
"step": 347
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1589.2500610351562,
|
|
"epoch": 0.19885714285714284,
|
|
"grad_norm": 0.8138798475265503,
|
|
"kl": 0.025634765625,
|
|
"learning_rate": 1.0008951725326441e-07,
|
|
"loss": -1.7143,
|
|
"reward": 0.4750000238418579,
|
|
"reward_std": 0.1596180573105812,
|
|
"rewards/accuracy_reward": 0.0,
|
|
"rewards/format_reward": 0.7916666865348816,
|
|
"step": 348
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1109.5417175292969,
|
|
"epoch": 0.19942857142857143,
|
|
"grad_norm": 0.814863383769989,
|
|
"kl": 0.024444580078125,
|
|
"learning_rate": 1.0002237986982564e-07,
|
|
"loss": -0.7403,
|
|
"reward": 0.7750000357627869,
|
|
"reward_std": 0.13869691640138626,
|
|
"rewards/accuracy_reward": 0.3333333432674408,
|
|
"rewards/format_reward": 0.9583333432674408,
|
|
"step": 349
|
|
},
|
|
{
|
|
"clip_ratio": 0.0,
|
|
"completion_length": 1071.9166870117188,
|
|
"epoch": 0.2,
|
|
"grad_norm": 0.7820748090744019,
|
|
"kl": 0.01959228515625,
|
|
"learning_rate": 1e-07,
|
|
"loss": -1.4839,
|
|
"reward": 0.7000000476837158,
|
|
"reward_std": 0.12247449159622192,
|
|
"rewards/accuracy_reward": 0.25,
|
|
"rewards/format_reward": 0.9166666865348816,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.2,
|
|
"step": 350,
|
|
"total_flos": 0.0,
|
|
"train_loss": -0.17849066271579692,
|
|
"train_runtime": 2375.2815,
|
|
"train_samples_per_second": 3.536,
|
|
"train_steps_per_second": 0.147
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 350,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 50,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|